blob: 3692da64122f8678fd65a73f1b44c99cc2fc8fcc [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner27e2d1f2018-11-01 00:52:28 +010043#include "pycore_state.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Larry Hastings61272b72014-01-07 12:41:53 -080052/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090053class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080054[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090055/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57/*[python input]
58class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68[python start generated code]*/
69/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process, Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Check used by the assertions of the macros below: a consistency check
   (without content check) in debug builds, a plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors.

   The _PyUnicode_XXX() forms read the structure member directly and can be
   used as lvalues.  PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() require a
   ready string and handle compact ASCII strings specially: their UTF-8
   data is the buffer stored right after the PyASCIIObject header and their
   UTF-8 length is the string length. */

/* UTF-8 representation */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Members of the PyASCIIObject header, without any check */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Same members, but asserting first that op is a string */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a PyUnicodeObject */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local replacement of PyUnicode_READY(): asserts that op is a string,
   then evaluates to 0 if it is already ready, or to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* true if the UTF-8 pointer of the (non compact-ASCII) string is the
   character data itself, i.e. no separate UTF-8 buffer exists */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the string is the character data itself.
   The macro must only refer to its own argument "op": it used to expand
   to _PyUnicode_WSTR(unicode) and so only compiled (and only tested the
   right object) where a variable named "unicode" happened to be in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is set and, for a ready string,
   it is not the character data itself */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && !(PyUnicode_IS_READY(op)                        \
          && _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to is a pointer of type "to_type *" to the buffer receiving the result.

   Each source character is cast to to_type.  The copy is done four
   characters at a time, followed by a loop for the remaining (at most
   three) characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_dst = (to_type *)(to);                \
        const from_type *_src = (from_type *)(begin);   \
        const from_type *_stop = (from_type *)(end);    \
        Py_ssize_t _count = _stop - _src;               \
        const from_type *_stop4 =                       \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);      \
        for (; _src < _stop4; _src += 4, _dst += 4) {   \
            _dst[0] = (to_type) _src[0];                \
            _dst[1] = (to_type) _src[1];                \
            _dst[2] = (to_type) _src[2];                \
            _dst[3] = (to_type) _src[3];                \
        }                                               \
        for (; _src < _stop; ++_src, ++_dst) {          \
            *_dst = (to_type) *_src;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215#define _Py_RETURN_UNICODE_EMPTY() \
216 do { \
217 _Py_INCREF_UNICODE_EMPTY(); \
218 return unicode_empty; \
219 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200221/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700222static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
224
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200225/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000228/* Single character Unicode strings in the Latin-1 range are being
229 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200230static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by ASCII code (0..127): the entry is 1 for
     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
   and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
262
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200263/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200264static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100266static int unicode_modifiable(PyObject *unicode);
267
Victor Stinnerfe226c02011-10-03 03:52:20 +0200268
Alexander Belopolsky40018472011-02-26 01:02:56 +0000269static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100270_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200271static PyObject *
272_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
273static PyObject *
274_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
275
276static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000278 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100279 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
281
Alexander Belopolsky40018472011-02-26 01:02:56 +0000282static void
283raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300284 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100285 PyObject *unicode,
286 Py_ssize_t startpos, Py_ssize_t endpos,
287 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000288
/* Same for linebreaks.

   Lookup table indexed by ASCII code (0..127): the entry is 1 for
     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN
   and 0 for every other code. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
316
INADA Naoki3ae20562017-01-16 20:41:20 +0900317static int convert_uc(PyObject *obj, void *addr);
318
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300319#include "clinic/unicodeobject.c.h"
320
Victor Stinner3d4226a2018-08-29 22:21:32 +0200321_Py_error_handler
322_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200323{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200324 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200325 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200326 }
327 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200328 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200329 }
330 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200331 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200332 }
333 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200334 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200335 }
336 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200337 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200338 }
339 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200340 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200341 }
342 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200343 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200344 }
Victor Stinner50149202015-09-22 00:26:54 +0200345 return _Py_ERROR_OTHER;
346}
347
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300348/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
349 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000351PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000352{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000353#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000354 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000355#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 /* This is actually an illegal character, so it should
357 not be passed to unichr. */
358 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000359#endif
360}
361
#ifdef Py_DEBUG
/* Check the internal consistency of the string object op (debug builds only).

   The state bits (kind, compact, ascii, ready), the data, utf8 and wstr
   pointers and their lengths are checked against each other, for each of
   the layouts: compact ASCII, compact non-ASCII, legacy string which is
   not ready (wchar kind) and legacy ready string.

   If check_content is non-zero and the string is not of the wchar kind,
   the characters are read as well to check that the narrowest possible
   kind is used and that the data ends with a null character.

   Each check is made with _PyObject_ASSERT() on op.  Return 1. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define ASSERT(expr) _PyObject_ASSERT(op, (expr))

    PyASCIIObject *ascii;
    unsigned int kind;

    ASSERT(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* legacy string: the data pointer is stored in the object */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* legacy string, ready */
                ASSERT(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ready == 1);
                ASSERT(data != NULL);
                if (!ascii->state.ascii) {
                    ASSERT(compact->utf8 != data);
                }
                else {
                    ASSERT(compact->utf8 == data);
                    ASSERT(compact->utf8_length == ascii->length);
                }
            }
            else {
                /* legacy string, not ready: only wstr is set */
                ASSERT(ascii->length == 0);
                ASSERT(ascii->hash == -1);
                ASSERT(ascii->state.compact == 0);
                ASSERT(ascii->state.ascii == 0);
                ASSERT(ascii->state.ready == 0);
                ASSERT(ascii->state.interned == SSTATE_NOT_INTERNED);
                ASSERT(ascii->wstr != NULL);
                ASSERT(data == NULL);
                ASSERT(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: data follows the object header */
            data = compact + 1;
            ASSERT(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            ASSERT(ascii->state.ascii == 0);
            ASSERT(ascii->state.ready == 1);
            ASSERT(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is the data itself only for the kind having the size
               of wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind != wchar_kind) {
                ASSERT(ascii->wstr != data);
            }
            else {
                ASSERT(ascii->wstr == data);
                ASSERT(compact->wstr_length == ascii->length);
            }
        }

        if (compact->utf8 == NULL) {
            ASSERT(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            ASSERT(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII string */
        ASSERT(kind == PyUnicode_1BYTE_KIND);
        ASSERT(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                ASSERT(maxchar < 128);
            }
            else {
                ASSERT(maxchar >= 128);
                ASSERT(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            ASSERT(maxchar >= 0x100);
            ASSERT(maxchar <= 0xFFFF);
            break;
        default:
            ASSERT(maxchar >= 0x10000);
            ASSERT(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be a null character */
        ASSERT(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef ASSERT
}
#endif
481
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100482static PyObject*
483unicode_result_wchar(PyObject *unicode)
484{
485#ifndef Py_DEBUG
486 Py_ssize_t len;
487
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 len = _PyUnicode_WSTR_LENGTH(unicode);
489 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100490 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200491 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100492 }
493
494 if (len == 1) {
495 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100496 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100497 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
498 Py_DECREF(unicode);
499 return latin1_char;
500 }
501 }
502
503 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200504 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100505 return NULL;
506 }
507#else
Victor Stinneraa771272012-10-04 02:32:58 +0200508 assert(Py_REFCNT(unicode) == 1);
509
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100510 /* don't make the result ready in debug mode to ensure that the caller
511 makes the string ready before using it */
512 assert(_PyUnicode_CheckConsistency(unicode, 1));
513#endif
514 return unicode;
515}
516
517static PyObject*
518unicode_result_ready(PyObject *unicode)
519{
520 Py_ssize_t length;
521
522 length = PyUnicode_GET_LENGTH(unicode);
523 if (length == 0) {
524 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100525 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200526 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100527 }
528 return unicode_empty;
529 }
530
531 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200532 void *data = PyUnicode_DATA(unicode);
533 int kind = PyUnicode_KIND(unicode);
534 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100535 if (ch < 256) {
536 PyObject *latin1_char = unicode_latin1[ch];
537 if (latin1_char != NULL) {
538 if (unicode != latin1_char) {
539 Py_INCREF(latin1_char);
540 Py_DECREF(unicode);
541 }
542 return latin1_char;
543 }
544 else {
545 assert(_PyUnicode_CheckConsistency(unicode, 1));
546 Py_INCREF(unicode);
547 unicode_latin1[ch] = unicode;
548 return unicode;
549 }
550 }
551 }
552
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554 return unicode;
555}
556
557static PyObject*
558unicode_result(PyObject *unicode)
559{
560 assert(_PyUnicode_CHECK(unicode));
561 if (PyUnicode_IS_READY(unicode))
562 return unicode_result_ready(unicode);
563 else
564 return unicode_result_wchar(unicode);
565}
566
Victor Stinnerc4b49542011-12-11 22:44:26 +0100567static PyObject*
568unicode_result_unchanged(PyObject *unicode)
569{
570 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500571 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100572 return NULL;
573 Py_INCREF(unicode);
574 return unicode;
575 }
576 else
577 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100578 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100579}
580
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200581/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
582 ASCII, Latin1, UTF-8, etc. */
583static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200584backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200585 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
586{
Victor Stinnerad771582015-10-09 12:38:53 +0200587 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200588 Py_UCS4 ch;
589 enum PyUnicode_Kind kind;
590 void *data;
591
592 assert(PyUnicode_IS_READY(unicode));
593 kind = PyUnicode_KIND(unicode);
594 data = PyUnicode_DATA(unicode);
595
596 size = 0;
597 /* determine replacement size */
598 for (i = collstart; i < collend; ++i) {
599 Py_ssize_t incr;
600
601 ch = PyUnicode_READ(kind, data, i);
602 if (ch < 0x100)
603 incr = 2+2;
604 else if (ch < 0x10000)
605 incr = 2+4;
606 else {
607 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200608 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200609 }
610 if (size > PY_SSIZE_T_MAX - incr) {
611 PyErr_SetString(PyExc_OverflowError,
612 "encoded result is too long for a Python string");
613 return NULL;
614 }
615 size += incr;
616 }
617
Victor Stinnerad771582015-10-09 12:38:53 +0200618 str = _PyBytesWriter_Prepare(writer, str, size);
619 if (str == NULL)
620 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200621
622 /* generate replacement */
623 for (i = collstart; i < collend; ++i) {
624 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200625 *str++ = '\\';
626 if (ch >= 0x00010000) {
627 *str++ = 'U';
628 *str++ = Py_hexdigits[(ch>>28)&0xf];
629 *str++ = Py_hexdigits[(ch>>24)&0xf];
630 *str++ = Py_hexdigits[(ch>>20)&0xf];
631 *str++ = Py_hexdigits[(ch>>16)&0xf];
632 *str++ = Py_hexdigits[(ch>>12)&0xf];
633 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200634 }
Victor Stinner797485e2015-10-09 03:17:30 +0200635 else if (ch >= 0x100) {
636 *str++ = 'u';
637 *str++ = Py_hexdigits[(ch>>12)&0xf];
638 *str++ = Py_hexdigits[(ch>>8)&0xf];
639 }
640 else
641 *str++ = 'x';
642 *str++ = Py_hexdigits[(ch>>4)&0xf];
643 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200644 }
645 return str;
646}
647
648/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
649 ASCII, Latin1, UTF-8, etc. */
650static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200651xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200652 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
653{
Victor Stinnerad771582015-10-09 12:38:53 +0200654 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200655 Py_UCS4 ch;
656 enum PyUnicode_Kind kind;
657 void *data;
658
659 assert(PyUnicode_IS_READY(unicode));
660 kind = PyUnicode_KIND(unicode);
661 data = PyUnicode_DATA(unicode);
662
663 size = 0;
664 /* determine replacement size */
665 for (i = collstart; i < collend; ++i) {
666 Py_ssize_t incr;
667
668 ch = PyUnicode_READ(kind, data, i);
669 if (ch < 10)
670 incr = 2+1+1;
671 else if (ch < 100)
672 incr = 2+2+1;
673 else if (ch < 1000)
674 incr = 2+3+1;
675 else if (ch < 10000)
676 incr = 2+4+1;
677 else if (ch < 100000)
678 incr = 2+5+1;
679 else if (ch < 1000000)
680 incr = 2+6+1;
681 else {
682 assert(ch <= MAX_UNICODE);
683 incr = 2+7+1;
684 }
685 if (size > PY_SSIZE_T_MAX - incr) {
686 PyErr_SetString(PyExc_OverflowError,
687 "encoded result is too long for a Python string");
688 return NULL;
689 }
690 size += incr;
691 }
692
Victor Stinnerad771582015-10-09 12:38:53 +0200693 str = _PyBytesWriter_Prepare(writer, str, size);
694 if (str == NULL)
695 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696
697 /* generate replacement */
698 for (i = collstart; i < collend; ++i) {
699 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
700 }
701 return str;
702}
703
Thomas Wouters477c8d52006-05-27 19:21:47 +0000704/* --- Bloom Filters ----------------------------------------------------- */
705
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
709
710/* the linebreak mask is set up by Unicode_Init below */
711
Antoine Pitrouf068f942010-01-13 14:19:12 +0000712#if LONG_BIT >= 128
713#define BLOOM_WIDTH 128
714#elif LONG_BIT >= 64
715#define BLOOM_WIDTH 64
716#elif LONG_BIT >= 32
717#define BLOOM_WIDTH 32
718#else
719#error "LONG_BIT is smaller than 32"
720#endif
721
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722#define BLOOM_MASK unsigned long
723
Serhiy Storchaka05997252013-01-26 12:14:02 +0200724static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725
Antoine Pitrouf068f942010-01-13 14:19:12 +0000726#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000727
Benjamin Peterson29060642009-01-31 22:14:21 +0000728#define BLOOM_LINEBREAK(ch) \
729 ((ch) < 128U ? ascii_linebreak[(ch)] : \
730 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700732static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200733make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734{
Victor Stinnera85af502013-04-09 21:53:54 +0200735#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
736 do { \
737 TYPE *data = (TYPE *)PTR; \
738 TYPE *end = data + LEN; \
739 Py_UCS4 ch; \
740 for (; data != end; data++) { \
741 ch = *data; \
742 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
743 } \
744 break; \
745 } while (0)
746
Thomas Wouters477c8d52006-05-27 19:21:47 +0000747 /* calculate simple bloom-style bitmask for a given unicode string */
748
Antoine Pitrouf068f942010-01-13 14:19:12 +0000749 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000750
751 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200752 switch (kind) {
753 case PyUnicode_1BYTE_KIND:
754 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
755 break;
756 case PyUnicode_2BYTE_KIND:
757 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
758 break;
759 case PyUnicode_4BYTE_KIND:
760 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
761 break;
762 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700763 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200764 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000765 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200766
767#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768}
769
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300770static int
771ensure_unicode(PyObject *obj)
772{
773 if (!PyUnicode_Check(obj)) {
774 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200775 "must be str, not %.100s",
776 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300777 return -1;
778 }
779 return PyUnicode_READY(obj);
780}
781
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200782/* Compilation of templated routines */
783
784#include "stringlib/asciilib.h"
785#include "stringlib/fastsearch.h"
786#include "stringlib/partition.h"
787#include "stringlib/split.h"
788#include "stringlib/count.h"
789#include "stringlib/find.h"
790#include "stringlib/find_max_char.h"
791#include "stringlib/localeutil.h"
792#include "stringlib/undef.h"
793
794#include "stringlib/ucs1lib.h"
795#include "stringlib/fastsearch.h"
796#include "stringlib/partition.h"
797#include "stringlib/split.h"
798#include "stringlib/count.h"
799#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300800#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200801#include "stringlib/find_max_char.h"
802#include "stringlib/localeutil.h"
803#include "stringlib/undef.h"
804
805#include "stringlib/ucs2lib.h"
806#include "stringlib/fastsearch.h"
807#include "stringlib/partition.h"
808#include "stringlib/split.h"
809#include "stringlib/count.h"
810#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300811#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200812#include "stringlib/find_max_char.h"
813#include "stringlib/localeutil.h"
814#include "stringlib/undef.h"
815
816#include "stringlib/ucs4lib.h"
817#include "stringlib/fastsearch.h"
818#include "stringlib/partition.h"
819#include "stringlib/split.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300822#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200823#include "stringlib/find_max_char.h"
824#include "stringlib/localeutil.h"
825#include "stringlib/undef.h"
826
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200827#include "stringlib/unicodedefs.h"
828#include "stringlib/fastsearch.h"
829#include "stringlib/count.h"
830#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100831#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200832
Guido van Rossumd57fd912000-03-10 22:53:23 +0000833/* --- Unicode Object ----------------------------------------------------- */
834
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700835static inline Py_ssize_t
836findchar(const void *s, int kind,
837 Py_ssize_t size, Py_UCS4 ch,
838 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200840 switch (kind) {
841 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if ((Py_UCS1) ch != ch)
843 return -1;
844 if (direction > 0)
845 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
846 else
847 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200848 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200849 if ((Py_UCS2) ch != ch)
850 return -1;
851 if (direction > 0)
852 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
853 else
854 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200855 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200856 if (direction > 0)
857 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
858 else
859 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200860 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700861 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863}
864
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in the range [old_length, current length) are
   overwritten, every byte being set to 0xff; nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value is used as the number of bytes per character */
        memset(bytes + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
883
Victor Stinnerfe226c02011-10-03 03:52:20 +0200884static PyObject*
885resize_compact(PyObject *unicode, Py_ssize_t length)
886{
887 Py_ssize_t char_size;
888 Py_ssize_t struct_size;
889 Py_ssize_t new_size;
890 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100891 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200892#ifdef Py_DEBUG
893 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
894#endif
895
Victor Stinner79891572012-05-03 13:43:07 +0200896 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100898 assert(PyUnicode_IS_COMPACT(unicode));
899
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200900 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100901 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 struct_size = sizeof(PyASCIIObject);
903 else
904 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200905 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200906
Victor Stinnerfe226c02011-10-03 03:52:20 +0200907 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
908 PyErr_NoMemory();
909 return NULL;
910 }
911 new_size = (struct_size + (length + 1) * char_size);
912
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200913 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
914 PyObject_DEL(_PyUnicode_UTF8(unicode));
915 _PyUnicode_UTF8(unicode) = NULL;
916 _PyUnicode_UTF8_LENGTH(unicode) = 0;
917 }
Victor Stinner84def372011-12-11 20:04:56 +0100918 _Py_DEC_REFTOTAL;
919 _Py_ForgetReference(unicode);
920
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300921 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100922 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100923 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200924 PyErr_NoMemory();
925 return NULL;
926 }
Victor Stinner84def372011-12-11 20:04:56 +0100927 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200928 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100929
Victor Stinnerfe226c02011-10-03 03:52:20 +0200930 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200931 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200932 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100933 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200934 _PyUnicode_WSTR_LENGTH(unicode) = length;
935 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
937 PyObject_DEL(_PyUnicode_WSTR(unicode));
938 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100939 if (!PyUnicode_IS_ASCII(unicode))
940 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100941 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200942#ifdef Py_DEBUG
943 unicode_fill_invalid(unicode, old_length);
944#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200945 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
946 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200947 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200948 return unicode;
949}
950
Alexander Belopolsky40018472011-02-26 01:02:56 +0000951static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200952resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000953{
Victor Stinner95663112011-10-04 01:03:50 +0200954 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100955 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000958
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 if (PyUnicode_IS_READY(unicode)) {
960 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200961 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200962 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200963#ifdef Py_DEBUG
964 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
965#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200968 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200969 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
970 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200971
972 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
973 PyErr_NoMemory();
974 return -1;
975 }
976 new_size = (length + 1) * char_size;
977
Victor Stinner7a9105a2011-12-12 00:13:42 +0100978 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
979 {
980 PyObject_DEL(_PyUnicode_UTF8(unicode));
981 _PyUnicode_UTF8(unicode) = NULL;
982 _PyUnicode_UTF8_LENGTH(unicode) = 0;
983 }
984
Victor Stinnerfe226c02011-10-03 03:52:20 +0200985 data = (PyObject *)PyObject_REALLOC(data, new_size);
986 if (data == NULL) {
987 PyErr_NoMemory();
988 return -1;
989 }
990 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200991 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200992 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200993 _PyUnicode_WSTR_LENGTH(unicode) = length;
994 }
995 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200996 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200997 _PyUnicode_UTF8_LENGTH(unicode) = length;
998 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 _PyUnicode_LENGTH(unicode) = length;
1000 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001001#ifdef Py_DEBUG
1002 unicode_fill_invalid(unicode, old_length);
1003#endif
Victor Stinner95663112011-10-04 01:03:50 +02001004 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001005 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001007 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008 }
Victor Stinner95663112011-10-04 01:03:50 +02001009 assert(_PyUnicode_WSTR(unicode) != NULL);
1010
1011 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001012 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001013 PyErr_NoMemory();
1014 return -1;
1015 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001016 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001017 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001018 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001019 if (!wstr) {
1020 PyErr_NoMemory();
1021 return -1;
1022 }
1023 _PyUnicode_WSTR(unicode) = wstr;
1024 _PyUnicode_WSTR(unicode)[length] = 0;
1025 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001026 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001027 return 0;
1028}
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_copy(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001034 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001036
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001037 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001038
1039 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1040 if (copy == NULL)
1041 return NULL;
1042
1043 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001044 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001046 }
1047 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001048 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001050 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001051 if (w == NULL)
1052 return NULL;
1053 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1054 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001055 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001056 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001057 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 }
1059}
1060
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001062 Ux0000 terminated; some code (e.g. new_identifier)
1063 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064
1065 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001066 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067
1068*/
1069
Alexander Belopolsky40018472011-02-26 01:02:56 +00001070static PyUnicodeObject *
1071_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001073 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
Thomas Wouters477c8d52006-05-27 19:21:47 +00001076 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077 if (length == 0 && unicode_empty != NULL) {
1078 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001079 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 }
1081
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001082 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001083 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001084 return (PyUnicodeObject *)PyErr_NoMemory();
1085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 if (length < 0) {
1087 PyErr_SetString(PyExc_SystemError,
1088 "Negative size passed to _PyUnicode_New");
1089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090 }
1091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001092 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1093 if (unicode == NULL)
1094 return NULL;
1095 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001096
1097 _PyUnicode_WSTR_LENGTH(unicode) = length;
1098 _PyUnicode_HASH(unicode) = -1;
1099 _PyUnicode_STATE(unicode).interned = 0;
1100 _PyUnicode_STATE(unicode).kind = 0;
1101 _PyUnicode_STATE(unicode).compact = 0;
1102 _PyUnicode_STATE(unicode).ready = 0;
1103 _PyUnicode_STATE(unicode).ascii = 0;
1104 _PyUnicode_DATA_ANY(unicode) = NULL;
1105 _PyUnicode_LENGTH(unicode) = 0;
1106 _PyUnicode_UTF8(unicode) = NULL;
1107 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1110 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001111 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001112 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001113 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115
Jeremy Hyltond8082792003-09-16 19:41:39 +00001116 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001117 * the caller fails before initializing str -- unicode_resize()
1118 * reads str[0], and the Keep-Alive optimization can keep memory
1119 * allocated for str alive across a call to unicode_dealloc(unicode).
1120 * We don't want unicode_resize to read uninitialized memory in
1121 * that case.
1122 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 _PyUnicode_WSTR(unicode)[0] = 0;
1124 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001125
Victor Stinner7931d9a2011-11-04 00:22:48 +01001126 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 return unicode;
1128}
1129
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130static const char*
1131unicode_kind_name(PyObject *unicode)
1132{
Victor Stinner42dfd712011-10-03 14:41:45 +02001133 /* don't check consistency: unicode_kind_name() is called from
1134 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 if (!PyUnicode_IS_COMPACT(unicode))
1136 {
1137 if (!PyUnicode_IS_READY(unicode))
1138 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001139 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001140 {
1141 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001142 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001143 return "legacy ascii";
1144 else
1145 return "legacy latin1";
1146 case PyUnicode_2BYTE_KIND:
1147 return "legacy UCS2";
1148 case PyUnicode_4BYTE_KIND:
1149 return "legacy UCS4";
1150 default:
1151 return "<legacy invalid kind>";
1152 }
1153 }
1154 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001155 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 return "ascii";
1159 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001162 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001163 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001164 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001165 default:
1166 return "<invalid compact kind>";
1167 }
1168}
1169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1175
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *chars = _PyUnicode_COMPACT_DATA(unicode);
    return chars;
}
1179void *_PyUnicode_data(void *unicode){
1180 printf("obj %p\n", unicode);
1181 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1182 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1183 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1184 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1185 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1186 return PyUnicode_DATA(unicode);
1187}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001188
1189void
1190_PyUnicode_Dump(PyObject *op)
1191{
1192 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1194 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1195 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001196
Victor Stinnera849a4b2011-10-03 12:12:11 +02001197 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001198 {
1199 if (ascii->state.ascii)
1200 data = (ascii + 1);
1201 else
1202 data = (compact + 1);
1203 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 else
1205 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001206 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1207 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera849a4b2011-10-03 12:12:11 +02001209 if (ascii->wstr == data)
1210 printf("shared ");
1211 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001212
Victor Stinnera3b334d2011-10-03 13:53:37 +02001213 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001214 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1216 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1218 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001219 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001221}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222#endif
1223
1224PyObject *
1225PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1226{
1227 PyObject *obj;
1228 PyCompactUnicodeObject *unicode;
1229 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001230 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001231 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001232 Py_ssize_t char_size;
1233 Py_ssize_t struct_size;
1234
1235 /* Optimization for empty strings */
1236 if (size == 0 && unicode_empty != NULL) {
1237 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001238 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 }
1240
Victor Stinner9e9d6892011-10-04 01:02:02 +02001241 is_ascii = 0;
1242 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 struct_size = sizeof(PyCompactUnicodeObject);
1244 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001245 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 char_size = 1;
1247 is_ascii = 1;
1248 struct_size = sizeof(PyASCIIObject);
1249 }
1250 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 1;
1253 }
1254 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001255 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256 char_size = 2;
1257 if (sizeof(wchar_t) == 2)
1258 is_sharing = 1;
1259 }
1260 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001261 if (maxchar > MAX_UNICODE) {
1262 PyErr_SetString(PyExc_SystemError,
1263 "invalid maximum character passed to PyUnicode_New");
1264 return NULL;
1265 }
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 4;
1268 if (sizeof(wchar_t) == 4)
1269 is_sharing = 1;
1270 }
1271
1272 /* Ensure we won't overflow the size. */
1273 if (size < 0) {
1274 PyErr_SetString(PyExc_SystemError,
1275 "Negative size passed to PyUnicode_New");
1276 return NULL;
1277 }
1278 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1279 return PyErr_NoMemory();
1280
1281 /* Duplicated allocation code from _PyObject_New() instead of a call to
1282 * PyObject_New() so we are able to allocate space for the object and
1283 * it's data buffer.
1284 */
1285 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1286 if (obj == NULL)
1287 return PyErr_NoMemory();
1288 obj = PyObject_INIT(obj, &PyUnicode_Type);
1289 if (obj == NULL)
1290 return NULL;
1291
1292 unicode = (PyCompactUnicodeObject *)obj;
1293 if (is_ascii)
1294 data = ((PyASCIIObject*)obj) + 1;
1295 else
1296 data = unicode + 1;
1297 _PyUnicode_LENGTH(unicode) = size;
1298 _PyUnicode_HASH(unicode) = -1;
1299 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001300 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301 _PyUnicode_STATE(unicode).compact = 1;
1302 _PyUnicode_STATE(unicode).ready = 1;
1303 _PyUnicode_STATE(unicode).ascii = is_ascii;
1304 if (is_ascii) {
1305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 }
Victor Stinner8f825062012-04-27 13:55:39 +02001308 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309 ((char*)data)[size] = 0;
1310 _PyUnicode_WSTR(unicode) = NULL;
1311 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 else {
1316 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001317 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001318 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001320 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ((Py_UCS4*)data)[size] = 0;
1322 if (is_sharing) {
1323 _PyUnicode_WSTR_LENGTH(unicode) = size;
1324 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1325 }
1326 else {
1327 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1328 _PyUnicode_WSTR(unicode) = NULL;
1329 }
1330 }
Victor Stinner8f825062012-04-27 13:55:39 +02001331#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001332 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001333#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001334 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 return obj;
1336}
1337
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: a
   high surrogate immediately followed by a low surrogate is joined into a
   single code point, every other element is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   The result is written into the 4-byte data of `unicode`, whose length
   must match the number of code points produced exactly (checked by
   assertions only).

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1377
Victor Stinnercd9950f2011-10-02 00:34:53 +02001378static int
Victor Stinner488fa492011-12-12 00:01:39 +01001379unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380{
Victor Stinner488fa492011-12-12 00:01:39 +01001381 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001382 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001383 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001384 return -1;
1385 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001386 return 0;
1387}
1388
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001389static int
1390_copy_characters(PyObject *to, Py_ssize_t to_start,
1391 PyObject *from, Py_ssize_t from_start,
1392 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 unsigned int from_kind, to_kind;
1395 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(0 <= how_many);
1398 assert(0 <= from_start);
1399 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001401 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001402 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403
Victor Stinnerd3f08822012-05-29 12:57:52 +02001404 assert(PyUnicode_Check(to));
1405 assert(PyUnicode_IS_READY(to));
1406 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1407
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001408 if (how_many == 0)
1409 return 0;
1410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001412 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001414 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Victor Stinnerf1852262012-06-16 16:38:26 +02001416#ifdef Py_DEBUG
1417 if (!check_maxchar
1418 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1419 {
1420 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1421 Py_UCS4 ch;
1422 Py_ssize_t i;
1423 for (i=0; i < how_many; i++) {
1424 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1425 assert(ch <= to_maxchar);
1426 }
1427 }
1428#endif
1429
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 if (check_maxchar
1432 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1433 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001434 /* Writing Latin-1 characters into an ASCII string requires to
1435 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001436 Py_UCS4 max_char;
1437 max_char = ucs1lib_find_max_char(from_data,
1438 (Py_UCS1*)from_data + how_many);
1439 if (max_char >= 128)
1440 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 }
Christian Heimesf051e432016-09-13 20:22:02 +02001442 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001443 (char*)from_data + from_kind * from_start,
1444 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001446 else if (from_kind == PyUnicode_1BYTE_KIND
1447 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001448 {
1449 _PyUnicode_CONVERT_BYTES(
1450 Py_UCS1, Py_UCS2,
1451 PyUnicode_1BYTE_DATA(from) + from_start,
1452 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1453 PyUnicode_2BYTE_DATA(to) + to_start
1454 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001455 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001456 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001457 && to_kind == PyUnicode_4BYTE_KIND)
1458 {
1459 _PyUnicode_CONVERT_BYTES(
1460 Py_UCS1, Py_UCS4,
1461 PyUnicode_1BYTE_DATA(from) + from_start,
1462 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1463 PyUnicode_4BYTE_DATA(to) + to_start
1464 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001465 }
1466 else if (from_kind == PyUnicode_2BYTE_KIND
1467 && to_kind == PyUnicode_4BYTE_KIND)
1468 {
1469 _PyUnicode_CONVERT_BYTES(
1470 Py_UCS2, Py_UCS4,
1471 PyUnicode_2BYTE_DATA(from) + from_start,
1472 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1473 PyUnicode_4BYTE_DATA(to) + to_start
1474 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001475 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001476 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001477 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1478
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001479 if (!check_maxchar) {
1480 if (from_kind == PyUnicode_2BYTE_KIND
1481 && to_kind == PyUnicode_1BYTE_KIND)
1482 {
1483 _PyUnicode_CONVERT_BYTES(
1484 Py_UCS2, Py_UCS1,
1485 PyUnicode_2BYTE_DATA(from) + from_start,
1486 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1487 PyUnicode_1BYTE_DATA(to) + to_start
1488 );
1489 }
1490 else if (from_kind == PyUnicode_4BYTE_KIND
1491 && to_kind == PyUnicode_1BYTE_KIND)
1492 {
1493 _PyUnicode_CONVERT_BYTES(
1494 Py_UCS4, Py_UCS1,
1495 PyUnicode_4BYTE_DATA(from) + from_start,
1496 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1497 PyUnicode_1BYTE_DATA(to) + to_start
1498 );
1499 }
1500 else if (from_kind == PyUnicode_4BYTE_KIND
1501 && to_kind == PyUnicode_2BYTE_KIND)
1502 {
1503 _PyUnicode_CONVERT_BYTES(
1504 Py_UCS4, Py_UCS2,
1505 PyUnicode_4BYTE_DATA(from) + from_start,
1506 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1507 PyUnicode_2BYTE_DATA(to) + to_start
1508 );
1509 }
1510 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001511 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001512 }
1513 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001514 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001516 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001517 Py_ssize_t i;
1518
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 for (i=0; i < how_many; i++) {
1520 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001521 if (ch > to_maxchar)
1522 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001523 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1524 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001525 }
1526 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001527 return 0;
1528}
1529
Victor Stinnerd3f08822012-05-29 12:57:52 +02001530void
1531_PyUnicode_FastCopyCharacters(
1532 PyObject *to, Py_ssize_t to_start,
1533 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001534{
1535 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1536}
1537
1538Py_ssize_t
1539PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1540 PyObject *from, Py_ssize_t from_start,
1541 Py_ssize_t how_many)
1542{
1543 int err;
1544
1545 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1546 PyErr_BadInternalCall();
1547 return -1;
1548 }
1549
Benjamin Petersonbac79492012-01-14 13:34:47 -05001550 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001551 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001552 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 return -1;
1554
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001555 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001559 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001560 PyErr_SetString(PyExc_IndexError, "string index out of range");
1561 return -1;
1562 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001563 if (how_many < 0) {
1564 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1565 return -1;
1566 }
1567 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001568 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1569 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001570 "Cannot write %zi characters at %zi "
1571 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001572 how_many, to_start, PyUnicode_GET_LENGTH(to));
1573 return -1;
1574 }
1575
1576 if (how_many == 0)
1577 return 0;
1578
Victor Stinner488fa492011-12-12 00:01:39 +01001579 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001580 return -1;
1581
1582 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1583 if (err) {
1584 PyErr_Format(PyExc_SystemError,
1585 "Cannot copy %s characters "
1586 "into a string of %s characters",
1587 unicode_kind_name(from),
1588 unicode_kind_name(to));
1589 return -1;
1590 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001591 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592}
1593
Victor Stinner17222162011-09-28 22:15:37 +02001594/* Find the maximum code point and count the number of surrogate pairs so a
1595 correct string length can be computed before converting a string to UCS4.
1596 This function counts single surrogates as a character and not as a pair.
1597
1598 Return 0 on success, or -1 on error. */
1599static int
1600find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1601 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602{
1603 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001604 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605
Victor Stinnerc53be962011-10-02 21:33:54 +02001606 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 *num_surrogates = 0;
1608 *maxchar = 0;
1609
1610 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001612 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1613 && (iter+1) < end
1614 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1615 {
1616 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1617 ++(*num_surrogates);
1618 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 }
1620 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001622 {
1623 ch = *iter;
1624 iter++;
1625 }
1626 if (ch > *maxchar) {
1627 *maxchar = ch;
1628 if (*maxchar > MAX_UNICODE) {
1629 PyErr_Format(PyExc_ValueError,
1630 "character U+%x is not in range [U+0000; U+10ffff]",
1631 ch);
1632 return -1;
1633 }
1634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 }
1636 return 0;
1637}
1638
/* Give a string that only has a wchar_t (wstr) buffer its canonical
   representation.

   The wstr buffer is scanned for its largest code point; depending on the
   result the characters are stored as 1-, 2- or 4-byte data.  The data is
   either placed in a newly allocated buffer (and the wstr buffer is freed)
   or, when wchar_t already has the required width, the wstr buffer itself
   becomes the data buffer.  The length, kind, ascii and utf8 fields are
   filled in and the string is flagged as ready.

   Return 0 on success.  Return -1 with an exception set if a character is
   outside the Unicode range or if memory allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One past the last wchar_t of the legacy buffer. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Latin-1 range: one byte per character, always in a new buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the utf8 pointer is set to the data buffer
               itself, no separate UTF-8 copy is made. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The characters now live in the new buffer; drop the wstr copy. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* Narrowed copy made; the 4-byte wstr buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) size computation below against
           overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is not 2 bytes wide on this build: the wstr buffer is
           used directly as the 4-byte data buffer (no copy, no free). */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1767
Alexander Belopolsky40018472011-02-26 01:02:56 +00001768static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001769unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770{
Walter Dörwald16807132007-05-25 13:52:07 +00001771 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001772 case SSTATE_NOT_INTERNED:
1773 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001774
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 case SSTATE_INTERNED_MORTAL:
1776 /* revive dead object temporarily for DelItem */
1777 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001778 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 Py_FatalError(
1780 "deletion of interned string failed");
1781 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 case SSTATE_INTERNED_IMMORTAL:
1784 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001785 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001786
Benjamin Peterson29060642009-01-31 22:14:21 +00001787 default:
1788 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001789 }
1790
Victor Stinner03490912011-10-03 23:45:12 +02001791 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001793 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001795 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1796 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001798 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799}
1800
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is the shared empty string or
   the cached single-character string for its Latin-1 character, else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty) {
        return 1;
    }

    /* Only a ready string of exactly one character can be a cached one. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode) {
        return 1;
    }
    return 0;
}
#endif
1817
Alexander Belopolsky40018472011-02-26 01:02:56 +00001818static int
Victor Stinner488fa492011-12-12 00:01:39 +01001819unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001820{
Victor Stinner488fa492011-12-12 00:01:39 +01001821 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (Py_REFCNT(unicode) != 1)
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (_PyUnicode_HASH(unicode) != -1)
1825 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001826 if (PyUnicode_CHECK_INTERNED(unicode))
1827 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001828 if (!PyUnicode_CheckExact(unicode))
1829 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001830#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001831 /* singleton refcount is greater than 1 */
1832 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001833#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001834 return 1;
1835}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001836
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837static int
1838unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1839{
1840 PyObject *unicode;
1841 Py_ssize_t old_length;
1842
1843 assert(p_unicode != NULL);
1844 unicode = *p_unicode;
1845
1846 assert(unicode != NULL);
1847 assert(PyUnicode_Check(unicode));
1848 assert(0 <= length);
1849
Victor Stinner910337b2011-10-03 03:20:16 +02001850 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001851 old_length = PyUnicode_WSTR_LENGTH(unicode);
1852 else
1853 old_length = PyUnicode_GET_LENGTH(unicode);
1854 if (old_length == length)
1855 return 0;
1856
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001857 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001858 _Py_INCREF_UNICODE_EMPTY();
1859 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001860 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001861 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001862 return 0;
1863 }
1864
Victor Stinner488fa492011-12-12 00:01:39 +01001865 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001866 PyObject *copy = resize_copy(unicode, length);
1867 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001868 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001869 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001870 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001871 }
1872
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001874 PyObject *new_unicode = resize_compact(unicode, length);
1875 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001876 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001877 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001878 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001879 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001880 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001881}
1882
Alexander Belopolsky40018472011-02-26 01:02:56 +00001883int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001885{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001886 PyObject *unicode;
1887 if (p_unicode == NULL) {
1888 PyErr_BadInternalCall();
1889 return -1;
1890 }
1891 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001892 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001893 {
1894 PyErr_BadInternalCall();
1895 return -1;
1896 }
1897 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001898}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001899
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001900/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001901
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001902 WARNING: The function doesn't copy the terminating null character and
1903 doesn't check the maximum character (may write a latin1 character in an
1904 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001905static void
1906unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1907 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001908{
1909 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1910 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001911 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
1913 switch (kind) {
1914 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001915 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001916#ifdef Py_DEBUG
1917 if (PyUnicode_IS_ASCII(unicode)) {
1918 Py_UCS4 maxchar = ucs1lib_find_max_char(
1919 (const Py_UCS1*)str,
1920 (const Py_UCS1*)str + len);
1921 assert(maxchar < 128);
1922 }
1923#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001924 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001925 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 }
1927 case PyUnicode_2BYTE_KIND: {
1928 Py_UCS2 *start = (Py_UCS2 *)data + index;
1929 Py_UCS2 *ucs2 = start;
1930 assert(index <= PyUnicode_GET_LENGTH(unicode));
1931
Victor Stinner184252a2012-06-16 02:57:41 +02001932 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 *ucs2 = (Py_UCS2)*str;
1934
1935 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 default: {
1939 Py_UCS4 *start = (Py_UCS4 *)data + index;
1940 Py_UCS4 *ucs4 = start;
1941 assert(kind == PyUnicode_4BYTE_KIND);
1942 assert(index <= PyUnicode_GET_LENGTH(unicode));
1943
Victor Stinner184252a2012-06-16 02:57:41 +02001944 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001945 *ucs4 = (Py_UCS4)*str;
1946
1947 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 }
1950}
1951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952static PyObject*
1953get_latin1_char(unsigned char ch)
1954{
Victor Stinnera464fc12011-10-02 20:39:30 +02001955 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001957 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 if (!unicode)
1959 return NULL;
1960 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001961 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962 unicode_latin1[ch] = unicode;
1963 }
1964 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001965 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966}
1967
Victor Stinner985a82a2014-01-03 12:53:47 +01001968static PyObject*
1969unicode_char(Py_UCS4 ch)
1970{
1971 PyObject *unicode;
1972
1973 assert(ch <= MAX_UNICODE);
1974
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001975 if (ch < 256)
1976 return get_latin1_char(ch);
1977
Victor Stinner985a82a2014-01-03 12:53:47 +01001978 unicode = PyUnicode_New(1, ch);
1979 if (unicode == NULL)
1980 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001981
1982 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1983 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001984 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001985 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001986 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1987 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1988 }
1989 assert(_PyUnicode_CheckConsistency(unicode, 1));
1990 return unicode;
1991}
1992
Alexander Belopolsky40018472011-02-26 01:02:56 +00001993PyObject *
1994PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001996 if (u == NULL)
1997 return (PyObject*)_PyUnicode_New(size);
1998
1999 if (size < 0) {
2000 PyErr_BadInternalCall();
2001 return NULL;
2002 }
2003
2004 return PyUnicode_FromWideChar(u, size);
2005}
2006
2007PyObject *
2008PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2009{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002010 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 Py_UCS4 maxchar = 0;
2012 Py_ssize_t num_surrogates;
2013
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002014 if (u == NULL && size != 0) {
2015 PyErr_BadInternalCall();
2016 return NULL;
2017 }
2018
2019 if (size == -1) {
2020 size = wcslen(u);
2021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002023 /* If the Unicode data is known at construction time, we can apply
2024 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002027 if (size == 0)
2028 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 /* Single character Unicode objects in the Latin-1 range are
2031 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002032 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return get_latin1_char((unsigned char)*u);
2034
2035 /* If not empty and not single character, copy the Unicode data
2036 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002037 if (find_maxchar_surrogates(u, u + size,
2038 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 return NULL;
2040
Victor Stinner8faf8212011-12-08 22:14:11 +01002041 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 if (!unicode)
2043 return NULL;
2044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 switch (PyUnicode_KIND(unicode)) {
2046 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002047 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2049 break;
2050 case PyUnicode_2BYTE_KIND:
2051#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002052 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002054 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2056#endif
2057 break;
2058 case PyUnicode_4BYTE_KIND:
2059#if SIZEOF_WCHAR_T == 2
2060 /* This is the only case which has to process surrogates, thus
2061 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002062 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063#else
2064 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002065 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066#endif
2067 break;
2068 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002069 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002072 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073}
2074
Alexander Belopolsky40018472011-02-26 01:02:56 +00002075PyObject *
2076PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002077{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002078 if (size < 0) {
2079 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002080 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002081 return NULL;
2082 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002083 if (u != NULL)
2084 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2085 else
2086 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002087}
2088
Alexander Belopolsky40018472011-02-26 01:02:56 +00002089PyObject *
2090PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002091{
2092 size_t size = strlen(u);
2093 if (size > PY_SSIZE_T_MAX) {
2094 PyErr_SetString(PyExc_OverflowError, "input too long");
2095 return NULL;
2096 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002097 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002098}
2099
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002100PyObject *
2101_PyUnicode_FromId(_Py_Identifier *id)
2102{
2103 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002104 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2105 strlen(id->string),
2106 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002107 if (!id->object)
2108 return NULL;
2109 PyUnicode_InternInPlace(&id->object);
2110 assert(!id->next);
2111 id->next = static_strings;
2112 static_strings = id;
2113 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002114 return id->object;
2115}
2116
2117void
2118_PyUnicode_ClearStaticStrings()
2119{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002120 _Py_Identifier *tmp, *s = static_strings;
2121 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002122 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002123 tmp = s->next;
2124 s->next = NULL;
2125 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002126 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002127 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002128}
2129
Benjamin Peterson0df54292012-03-26 14:50:32 -04002130/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002131
Victor Stinnerd3f08822012-05-29 12:57:52 +02002132PyObject*
2133_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002134{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002135 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002136 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002137 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002138#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002139 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002140#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002141 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002142 }
Victor Stinner785938e2011-12-11 20:09:03 +01002143 unicode = PyUnicode_New(size, 127);
2144 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002145 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002146 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2147 assert(_PyUnicode_CheckConsistency(unicode, 1));
2148 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002149}
2150
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002151static Py_UCS4
2152kind_maxchar_limit(unsigned int kind)
2153{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002154 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002155 case PyUnicode_1BYTE_KIND:
2156 return 0x80;
2157 case PyUnicode_2BYTE_KIND:
2158 return 0x100;
2159 case PyUnicode_4BYTE_KIND:
2160 return 0x10000;
2161 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002162 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002163 }
2164}
2165
Victor Stinner702c7342011-10-05 13:50:52 +02002166static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002167_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002171
Serhiy Storchaka678db842013-01-26 12:16:36 +02002172 if (size == 0)
2173 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002174 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002175 if (size == 1)
2176 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002177
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002178 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002179 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 if (!res)
2181 return NULL;
2182 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002183 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002185}
2186
Victor Stinnere57b1c02011-09-28 22:20:48 +02002187static PyObject*
2188_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189{
2190 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002192
Serhiy Storchaka678db842013-01-26 12:16:36 +02002193 if (size == 0)
2194 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002195 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002196 if (size == 1)
2197 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002198
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002199 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002200 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 if (!res)
2202 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002203 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002205 else {
2206 _PyUnicode_CONVERT_BYTES(
2207 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2208 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002209 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 return res;
2211}
2212
Victor Stinnere57b1c02011-09-28 22:20:48 +02002213static PyObject*
2214_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215{
2216 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002218
Serhiy Storchaka678db842013-01-26 12:16:36 +02002219 if (size == 0)
2220 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002221 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002222 if (size == 1)
2223 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002224
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002225 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002226 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 if (!res)
2228 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002229 if (max_char < 256)
2230 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2231 PyUnicode_1BYTE_DATA(res));
2232 else if (max_char < 0x10000)
2233 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2234 PyUnicode_2BYTE_DATA(res));
2235 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002237 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 return res;
2239}
2240
2241PyObject*
2242PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2243{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002244 if (size < 0) {
2245 PyErr_SetString(PyExc_ValueError, "size must be positive");
2246 return NULL;
2247 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002248 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002250 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002252 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002254 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002256 PyErr_SetString(PyExc_SystemError, "invalid kind");
2257 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259}
2260
Victor Stinnerece58de2012-04-23 23:36:38 +02002261Py_UCS4
2262_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2263{
2264 enum PyUnicode_Kind kind;
2265 void *startptr, *endptr;
2266
2267 assert(PyUnicode_IS_READY(unicode));
2268 assert(0 <= start);
2269 assert(end <= PyUnicode_GET_LENGTH(unicode));
2270 assert(start <= end);
2271
2272 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2273 return PyUnicode_MAX_CHAR_VALUE(unicode);
2274
2275 if (start == end)
2276 return 127;
2277
Victor Stinner94d558b2012-04-27 22:26:58 +02002278 if (PyUnicode_IS_ASCII(unicode))
2279 return 127;
2280
Victor Stinnerece58de2012-04-23 23:36:38 +02002281 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002282 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002283 endptr = (char *)startptr + end * kind;
2284 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002285 switch(kind) {
2286 case PyUnicode_1BYTE_KIND:
2287 return ucs1lib_find_max_char(startptr, endptr);
2288 case PyUnicode_2BYTE_KIND:
2289 return ucs2lib_find_max_char(startptr, endptr);
2290 case PyUnicode_4BYTE_KIND:
2291 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002292 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002293 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002294 }
2295}
2296
Victor Stinner25a4b292011-10-06 12:31:55 +02002297/* Ensure that a string uses the most efficient storage, if it is not the
2298 case: create a new string with of the right kind. Write NULL into *p_unicode
2299 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002300static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002301unicode_adjust_maxchar(PyObject **p_unicode)
2302{
2303 PyObject *unicode, *copy;
2304 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002305 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002306 unsigned int kind;
2307
2308 assert(p_unicode != NULL);
2309 unicode = *p_unicode;
2310 assert(PyUnicode_IS_READY(unicode));
2311 if (PyUnicode_IS_ASCII(unicode))
2312 return;
2313
2314 len = PyUnicode_GET_LENGTH(unicode);
2315 kind = PyUnicode_KIND(unicode);
2316 if (kind == PyUnicode_1BYTE_KIND) {
2317 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs1lib_find_max_char(u, u + len);
2319 if (max_char >= 128)
2320 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002321 }
2322 else if (kind == PyUnicode_2BYTE_KIND) {
2323 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002324 max_char = ucs2lib_find_max_char(u, u + len);
2325 if (max_char >= 256)
2326 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002327 }
2328 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002329 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002330 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002331 max_char = ucs4lib_find_max_char(u, u + len);
2332 if (max_char >= 0x10000)
2333 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002334 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002335 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002336 if (copy != NULL)
2337 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002338 Py_DECREF(unicode);
2339 *p_unicode = copy;
2340}
2341
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002343_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002344{
Victor Stinner87af4f22011-11-21 23:03:47 +01002345 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002347
Victor Stinner034f6cf2011-09-30 02:26:44 +02002348 if (!PyUnicode_Check(unicode)) {
2349 PyErr_BadInternalCall();
2350 return NULL;
2351 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002352 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002353 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002354
Victor Stinner87af4f22011-11-21 23:03:47 +01002355 length = PyUnicode_GET_LENGTH(unicode);
2356 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002357 if (!copy)
2358 return NULL;
2359 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2360
Christian Heimesf051e432016-09-13 20:22:02 +02002361 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002362 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002363 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002364 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002365}
2366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368/* Widen Unicode objects to larger buffers. Don't write terminating null
2369 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002370
2371void*
2372_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2373{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002374 Py_ssize_t len;
2375 void *result;
2376 unsigned int skind;
2377
Benjamin Petersonbac79492012-01-14 13:34:47 -05002378 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002379 return NULL;
2380
2381 len = PyUnicode_GET_LENGTH(s);
2382 skind = PyUnicode_KIND(s);
2383 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002384 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 return NULL;
2386 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002387 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002388 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002389 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002390 if (!result)
2391 return PyErr_NoMemory();
2392 assert(skind == PyUnicode_1BYTE_KIND);
2393 _PyUnicode_CONVERT_BYTES(
2394 Py_UCS1, Py_UCS2,
2395 PyUnicode_1BYTE_DATA(s),
2396 PyUnicode_1BYTE_DATA(s) + len,
2397 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002399 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002400 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002401 if (!result)
2402 return PyErr_NoMemory();
2403 if (skind == PyUnicode_2BYTE_KIND) {
2404 _PyUnicode_CONVERT_BYTES(
2405 Py_UCS2, Py_UCS4,
2406 PyUnicode_2BYTE_DATA(s),
2407 PyUnicode_2BYTE_DATA(s) + len,
2408 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002410 else {
2411 assert(skind == PyUnicode_1BYTE_KIND);
2412 _PyUnicode_CONVERT_BYTES(
2413 Py_UCS1, Py_UCS4,
2414 PyUnicode_1BYTE_DATA(s),
2415 PyUnicode_1BYTE_DATA(s) + len,
2416 result);
2417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002419 default:
2420 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 }
Victor Stinner01698042011-10-04 00:04:26 +02002422 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 return NULL;
2424}
2425
2426static Py_UCS4*
2427as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2428 int copy_null)
2429{
2430 int kind;
2431 void *data;
2432 Py_ssize_t len, targetlen;
2433 if (PyUnicode_READY(string) == -1)
2434 return NULL;
2435 kind = PyUnicode_KIND(string);
2436 data = PyUnicode_DATA(string);
2437 len = PyUnicode_GET_LENGTH(string);
2438 targetlen = len;
2439 if (copy_null)
2440 targetlen++;
2441 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002442 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 if (!target) {
2444 PyErr_NoMemory();
2445 return NULL;
2446 }
2447 }
2448 else {
2449 if (targetsize < targetlen) {
2450 PyErr_Format(PyExc_SystemError,
2451 "string is longer than the buffer");
2452 if (copy_null && 0 < targetsize)
2453 target[0] = 0;
2454 return NULL;
2455 }
2456 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002457 if (kind == PyUnicode_1BYTE_KIND) {
2458 Py_UCS1 *start = (Py_UCS1 *) data;
2459 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002461 else if (kind == PyUnicode_2BYTE_KIND) {
2462 Py_UCS2 *start = (Py_UCS2 *) data;
2463 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2464 }
2465 else {
2466 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002467 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (copy_null)
2470 target[len] = 0;
2471 return target;
2472}
2473
2474Py_UCS4*
2475PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2476 int copy_null)
2477{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002478 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 PyErr_BadInternalCall();
2480 return NULL;
2481 }
2482 return as_ucs4(string, target, targetsize, copy_null);
2483}
2484
2485Py_UCS4*
2486PyUnicode_AsUCS4Copy(PyObject *string)
2487{
2488 return as_ucs4(string, NULL, 0, 1);
2489}
2490
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits; 53/22 is an
   upper bound for log10(256).  The leading 2 in the macro leaves room for
   the sign plus one more character (such as the terminating null byte
   written by sprintf). */
2494#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002495
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002496static int
2497unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2498 Py_ssize_t width, Py_ssize_t precision)
2499{
2500 Py_ssize_t length, fill, arglen;
2501 Py_UCS4 maxchar;
2502
2503 if (PyUnicode_READY(str) == -1)
2504 return -1;
2505
2506 length = PyUnicode_GET_LENGTH(str);
2507 if ((precision == -1 || precision >= length)
2508 && width <= length)
2509 return _PyUnicodeWriter_WriteStr(writer, str);
2510
2511 if (precision != -1)
2512 length = Py_MIN(precision, length);
2513
2514 arglen = Py_MAX(length, width);
2515 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2516 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2517 else
2518 maxchar = writer->maxchar;
2519
2520 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2521 return -1;
2522
2523 if (width > length) {
2524 fill = width - length;
2525 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2526 return -1;
2527 writer->pos += fill;
2528 }
2529
2530 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2531 str, 0, length);
2532 writer->pos += length;
2533 return 0;
2534}
2535
2536static int
Victor Stinner998b8062018-09-12 00:23:25 +02002537unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002538 Py_ssize_t width, Py_ssize_t precision)
2539{
2540 /* UTF-8 */
2541 Py_ssize_t length;
2542 PyObject *unicode;
2543 int res;
2544
2545 length = strlen(str);
2546 if (precision != -1)
2547 length = Py_MIN(length, precision);
2548 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2549 if (unicode == NULL)
2550 return -1;
2551
2552 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2553 Py_DECREF(unicode);
2554 return res;
2555}
2556
Victor Stinner96865452011-03-01 23:44:09 +00002557static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002558unicode_fromformat_arg(_PyUnicodeWriter *writer,
2559 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002560{
Victor Stinnere215d962012-10-06 23:03:36 +02002561 const char *p;
2562 Py_ssize_t len;
2563 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002564 Py_ssize_t width;
2565 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002566 int longflag;
2567 int longlongflag;
2568 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002569 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002570
2571 p = f;
2572 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002573 zeropad = 0;
2574 if (*f == '0') {
2575 zeropad = 1;
2576 f++;
2577 }
Victor Stinner96865452011-03-01 23:44:09 +00002578
2579 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 width = -1;
2581 if (Py_ISDIGIT((unsigned)*f)) {
2582 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002583 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002584 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002586 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002587 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002588 return NULL;
2589 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002591 f++;
2592 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 }
2594 precision = -1;
2595 if (*f == '.') {
2596 f++;
2597 if (Py_ISDIGIT((unsigned)*f)) {
2598 precision = (*f - '0');
2599 f++;
2600 while (Py_ISDIGIT((unsigned)*f)) {
2601 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2602 PyErr_SetString(PyExc_ValueError,
2603 "precision too big");
2604 return NULL;
2605 }
2606 precision = (precision * 10) + (*f - '0');
2607 f++;
2608 }
2609 }
Victor Stinner96865452011-03-01 23:44:09 +00002610 if (*f == '%') {
2611 /* "%.3%s" => f points to "3" */
2612 f--;
2613 }
2614 }
2615 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002616 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002617 f--;
2618 }
Victor Stinner96865452011-03-01 23:44:09 +00002619
2620 /* Handle %ld, %lu, %lld and %llu. */
2621 longflag = 0;
2622 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002623 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002624 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002625 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002626 longflag = 1;
2627 ++f;
2628 }
Victor Stinner96865452011-03-01 23:44:09 +00002629 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002630 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002631 longlongflag = 1;
2632 f += 2;
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634 }
2635 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002636 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002637 size_tflag = 1;
2638 ++f;
2639 }
Victor Stinnere215d962012-10-06 23:03:36 +02002640
2641 if (f[1] == '\0')
2642 writer->overallocate = 0;
2643
2644 switch (*f) {
2645 case 'c':
2646 {
2647 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002648 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002649 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002650 "character argument not in range(0x110000)");
2651 return NULL;
2652 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002653 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002654 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002655 break;
2656 }
2657
2658 case 'i':
2659 case 'd':
2660 case 'u':
2661 case 'x':
2662 {
2663 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002664 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002665 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002666
2667 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002668 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002669 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002670 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002671 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002672 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002673 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002674 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002675 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002676 va_arg(*vargs, size_t));
2677 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002678 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002679 va_arg(*vargs, unsigned int));
2680 }
2681 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002682 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002683 }
2684 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002685 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002686 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002688 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002689 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002690 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002691 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002692 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002693 va_arg(*vargs, Py_ssize_t));
2694 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002695 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_arg(*vargs, int));
2697 }
2698 assert(len >= 0);
2699
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (precision < len)
2701 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002702
2703 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2705 return NULL;
2706
Victor Stinnere215d962012-10-06 23:03:36 +02002707 if (width > precision) {
2708 Py_UCS4 fillchar;
2709 fill = width - precision;
2710 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002711 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2712 return NULL;
2713 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002714 }
Victor Stinner15a11362012-10-06 23:48:20 +02002715 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002716 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002717 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2718 return NULL;
2719 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002720 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002721
Victor Stinner4a587072013-11-19 12:54:53 +01002722 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2723 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002724 break;
2725 }
2726
2727 case 'p':
2728 {
2729 char number[MAX_LONG_LONG_CHARS];
2730
2731 len = sprintf(number, "%p", va_arg(*vargs, void*));
2732 assert(len >= 0);
2733
2734 /* %p is ill-defined: ensure leading 0x. */
2735 if (number[1] == 'X')
2736 number[1] = 'x';
2737 else if (number[1] != 'x') {
2738 memmove(number + 2, number,
2739 strlen(number) + 1);
2740 number[0] = '0';
2741 number[1] = 'x';
2742 len += 2;
2743 }
2744
Victor Stinner4a587072013-11-19 12:54:53 +01002745 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002746 return NULL;
2747 break;
2748 }
2749
2750 case 's':
2751 {
2752 /* UTF-8 */
2753 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002754 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002755 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002756 break;
2757 }
2758
2759 case 'U':
2760 {
2761 PyObject *obj = va_arg(*vargs, PyObject *);
2762 assert(obj && _PyUnicode_CHECK(obj));
2763
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002765 return NULL;
2766 break;
2767 }
2768
2769 case 'V':
2770 {
2771 PyObject *obj = va_arg(*vargs, PyObject *);
2772 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002773 if (obj) {
2774 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002775 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002776 return NULL;
2777 }
2778 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002780 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002782 }
2783 break;
2784 }
2785
2786 case 'S':
2787 {
2788 PyObject *obj = va_arg(*vargs, PyObject *);
2789 PyObject *str;
2790 assert(obj);
2791 str = PyObject_Str(obj);
2792 if (!str)
2793 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002795 Py_DECREF(str);
2796 return NULL;
2797 }
2798 Py_DECREF(str);
2799 break;
2800 }
2801
2802 case 'R':
2803 {
2804 PyObject *obj = va_arg(*vargs, PyObject *);
2805 PyObject *repr;
2806 assert(obj);
2807 repr = PyObject_Repr(obj);
2808 if (!repr)
2809 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002811 Py_DECREF(repr);
2812 return NULL;
2813 }
2814 Py_DECREF(repr);
2815 break;
2816 }
2817
2818 case 'A':
2819 {
2820 PyObject *obj = va_arg(*vargs, PyObject *);
2821 PyObject *ascii;
2822 assert(obj);
2823 ascii = PyObject_ASCII(obj);
2824 if (!ascii)
2825 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002826 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002827 Py_DECREF(ascii);
2828 return NULL;
2829 }
2830 Py_DECREF(ascii);
2831 break;
2832 }
2833
2834 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002835 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002836 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002837 break;
2838
2839 default:
2840 /* if we stumble upon an unknown formatting code, copy the rest
2841 of the format string to the output string. (we cannot just
2842 skip the code, since there's no way to know what's in the
2843 argument list) */
2844 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002845 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002846 return NULL;
2847 f = p+len;
2848 return f;
2849 }
2850
2851 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002852 return f;
2853}
2854
Walter Dörwaldd2034312007-05-18 16:29:38 +00002855PyObject *
2856PyUnicode_FromFormatV(const char *format, va_list vargs)
2857{
Victor Stinnere215d962012-10-06 23:03:36 +02002858 va_list vargs2;
2859 const char *f;
2860 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002861
Victor Stinner8f674cc2013-04-17 23:02:17 +02002862 _PyUnicodeWriter_Init(&writer);
2863 writer.min_length = strlen(format) + 100;
2864 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002865
Benjamin Peterson0c212142016-09-20 20:39:33 -07002866 // Copy varags to be able to pass a reference to a subfunction.
2867 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002868
2869 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002870 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002871 f = unicode_fromformat_arg(&writer, f, &vargs2);
2872 if (f == NULL)
2873 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002876 const char *p;
2877 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002878
Victor Stinnere215d962012-10-06 23:03:36 +02002879 p = f;
2880 do
2881 {
2882 if ((unsigned char)*p > 127) {
2883 PyErr_Format(PyExc_ValueError,
2884 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2885 "string, got a non-ASCII byte: 0x%02x",
2886 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002887 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002888 }
2889 p++;
2890 }
2891 while (*p != '\0' && *p != '%');
2892 len = p - f;
2893
2894 if (*p == '\0')
2895 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002896
2897 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002898 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002899
2900 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002902 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002903 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002904 return _PyUnicodeWriter_Finish(&writer);
2905
2906 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002907 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002908 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002909 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002910}
2911
Walter Dörwaldd2034312007-05-18 16:29:38 +00002912PyObject *
2913PyUnicode_FromFormat(const char *format, ...)
2914{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002915 PyObject* ret;
2916 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002917
2918#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002919 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002920#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002921 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002922#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002923 ret = PyUnicode_FromFormatV(format, vargs);
2924 va_end(vargs);
2925 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002926}
2927
Serhiy Storchakac46db922018-10-23 22:58:24 +03002928static Py_ssize_t
2929unicode_get_widechar_size(PyObject *unicode)
2930{
2931 Py_ssize_t res;
2932
2933 assert(unicode != NULL);
2934 assert(_PyUnicode_CHECK(unicode));
2935
2936 if (_PyUnicode_WSTR(unicode) != NULL) {
2937 return PyUnicode_WSTR_LENGTH(unicode);
2938 }
2939 assert(PyUnicode_IS_READY(unicode));
2940
2941 res = _PyUnicode_LENGTH(unicode);
2942#if SIZEOF_WCHAR_T == 2
2943 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
2944 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2945 const Py_UCS4 *end = s + res;
2946 for (; s < end; ++s) {
2947 if (*s > 0xFFFF) {
2948 ++res;
2949 }
2950 }
2951 }
2952#endif
2953 return res;
2954}
2955
2956static void
2957unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
2958{
2959 const wchar_t *wstr;
2960
2961 assert(unicode != NULL);
2962 assert(_PyUnicode_CHECK(unicode));
2963
2964 wstr = _PyUnicode_WSTR(unicode);
2965 if (wstr != NULL) {
2966 memcpy(w, wstr, size * sizeof(wchar_t));
2967 return;
2968 }
2969 assert(PyUnicode_IS_READY(unicode));
2970
2971 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
2972 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
2973 for (; size--; ++s, ++w) {
2974 *w = *s;
2975 }
2976 }
2977 else {
2978#if SIZEOF_WCHAR_T == 4
2979 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
2980 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
2981 for (; size--; ++s, ++w) {
2982 *w = *s;
2983 }
2984#else
2985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2986 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
2987 for (; size--; ++s, ++w) {
2988 Py_UCS4 ch = *s;
2989 if (ch > 0xFFFF) {
2990 assert(ch <= MAX_UNICODE);
2991 /* encode surrogate pair in this case */
2992 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
2993 if (!size--)
2994 break;
2995 *w = Py_UNICODE_LOW_SURROGATE(ch);
2996 }
2997 else {
2998 *w = ch;
2999 }
3000 }
3001#endif
3002 }
3003}
3004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005#ifdef HAVE_WCHAR_H
3006
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003007/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003008
Victor Stinnerd88d9832011-09-06 02:00:05 +02003009 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003010 character) required to convert the unicode object. Ignore size argument.
3011
Victor Stinnerd88d9832011-09-06 02:00:05 +02003012 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003013 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003014 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003015Py_ssize_t
3016PyUnicode_AsWideChar(PyObject *unicode,
3017 wchar_t *w,
3018 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003019{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003020 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003021
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003022 if (unicode == NULL) {
3023 PyErr_BadInternalCall();
3024 return -1;
3025 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003026 if (!PyUnicode_Check(unicode)) {
3027 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003028 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003029 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003030
3031 res = unicode_get_widechar_size(unicode);
3032 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003033 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003034 }
3035
3036 if (size > res) {
3037 size = res + 1;
3038 }
3039 else {
3040 res = size;
3041 }
3042 unicode_copy_as_widechar(unicode, w, size);
3043 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003044}
3045
Victor Stinner137c34c2010-09-29 10:25:54 +00003046wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003047PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003048 Py_ssize_t *size)
3049{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003050 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003051 Py_ssize_t buflen;
3052
3053 if (unicode == NULL) {
3054 PyErr_BadInternalCall();
3055 return NULL;
3056 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003057 if (!PyUnicode_Check(unicode)) {
3058 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003059 return NULL;
3060 }
3061
Serhiy Storchakac46db922018-10-23 22:58:24 +03003062 buflen = unicode_get_widechar_size(unicode);
3063 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003064 if (buffer == NULL) {
3065 PyErr_NoMemory();
3066 return NULL;
3067 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003068 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3069 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003070 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003071 }
3072 else if (wcslen(buffer) != (size_t)buflen) {
3073 PyMem_FREE(buffer);
3074 PyErr_SetString(PyExc_ValueError,
3075 "embedded null character");
3076 return NULL;
3077 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003078 return buffer;
3079}
3080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003081#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003085{
Victor Stinner8faf8212011-12-08 22:14:11 +01003086 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003087 PyErr_SetString(PyExc_ValueError,
3088 "chr() arg not in range(0x110000)");
3089 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003090 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003091
Victor Stinner985a82a2014-01-03 12:53:47 +01003092 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003093}
3094
Alexander Belopolsky40018472011-02-26 01:02:56 +00003095PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003096PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003098 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003100 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003101 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003102 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 Py_INCREF(obj);
3104 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003105 }
3106 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003107 /* For a Unicode subtype that's not a Unicode object,
3108 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003109 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003110 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003111 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003112 "Can't convert '%.100s' object to str implicitly",
3113 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003114 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003115}
3116
Alexander Belopolsky40018472011-02-26 01:02:56 +00003117PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003118PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003119 const char *encoding,
3120 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003121{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003122 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003123 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003124
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 PyErr_BadInternalCall();
3127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003129
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003130 /* Decoding bytes objects is the most common case and should be fast */
3131 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003132 if (PyBytes_GET_SIZE(obj) == 0)
3133 _Py_RETURN_UNICODE_EMPTY();
3134 v = PyUnicode_Decode(
3135 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3136 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003137 return v;
3138 }
3139
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003140 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 PyErr_SetString(PyExc_TypeError,
3142 "decoding str is not supported");
3143 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003144 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003145
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003146 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3147 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3148 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003149 "decoding to str: need a bytes-like object, %.80s found",
3150 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003151 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003152 }
Tim Petersced69f82003-09-16 20:30:58 +00003153
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003154 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003155 PyBuffer_Release(&buffer);
3156 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003158
Serhiy Storchaka05997252013-01-26 12:14:02 +02003159 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003160 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003161 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162}
3163
/* Normalize an encoding name into the buffer `lower` (of `lower_len`
   bytes): similar to encodings.normalize_encoding(), but also convert to
   lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_',
   and such runs at the start or end of the name are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const limit = lower + (lower_len - 1);
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != '\0'; in++) {
        const char ch = *in;

        if (!Py_ISALNUM(ch) && ch != '.') {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(ch);
    }

    *out = '\0';
    return 1;
}
3212
Alexander Belopolsky40018472011-02-26 01:02:56 +00003213PyObject *
3214PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003215 Py_ssize_t size,
3216 const char *encoding,
3217 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003218{
3219 PyObject *buffer = NULL, *unicode;
3220 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003221 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3222
3223 if (encoding == NULL) {
3224 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3225 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003226
Fred Drakee4315f52000-05-09 19:53:39 +00003227 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003228 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3229 char *lower = buflower;
3230
3231 /* Fast paths */
3232 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3233 lower += 3;
3234 if (*lower == '_') {
3235 /* Match "utf8" and "utf_8" */
3236 lower++;
3237 }
3238
3239 if (lower[0] == '8' && lower[1] == 0) {
3240 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3241 }
3242 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3243 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3244 }
3245 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3246 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3247 }
3248 }
3249 else {
3250 if (strcmp(lower, "ascii") == 0
3251 || strcmp(lower, "us_ascii") == 0) {
3252 return PyUnicode_DecodeASCII(s, size, errors);
3253 }
Steve Dowercc16be82016-09-08 10:35:16 -07003254 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003255 else if (strcmp(lower, "mbcs") == 0) {
3256 return PyUnicode_DecodeMBCS(s, size, errors);
3257 }
3258 #endif
3259 else if (strcmp(lower, "latin1") == 0
3260 || strcmp(lower, "latin_1") == 0
3261 || strcmp(lower, "iso_8859_1") == 0
3262 || strcmp(lower, "iso8859_1") == 0) {
3263 return PyUnicode_DecodeLatin1(s, size, errors);
3264 }
3265 }
Victor Stinner37296e82010-06-10 13:36:23 +00003266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267
3268 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003269 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003270 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003271 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003272 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 if (buffer == NULL)
3274 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003275 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 if (unicode == NULL)
3277 goto onError;
3278 if (!PyUnicode_Check(unicode)) {
3279 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003280 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003281 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003282 encoding,
3283 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 Py_DECREF(unicode);
3285 goto onError;
3286 }
3287 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003288 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003289
Benjamin Peterson29060642009-01-31 22:14:21 +00003290 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 Py_XDECREF(buffer);
3292 return NULL;
3293}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
3296PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 const char *encoding,
3298 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003299{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003300 if (!PyUnicode_Check(unicode)) {
3301 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003302 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003303 }
3304
Serhiy Storchaka00939072016-10-27 21:05:49 +03003305 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3306 "PyUnicode_AsDecodedObject() is deprecated; "
3307 "use PyCodec_Decode() to decode from str", 1) < 0)
3308 return NULL;
3309
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003310 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003312
3313 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003314 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003315}
3316
Alexander Belopolsky40018472011-02-26 01:02:56 +00003317PyObject *
3318PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003319 const char *encoding,
3320 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003321{
3322 PyObject *v;
3323
3324 if (!PyUnicode_Check(unicode)) {
3325 PyErr_BadArgument();
3326 goto onError;
3327 }
3328
Serhiy Storchaka00939072016-10-27 21:05:49 +03003329 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3330 "PyUnicode_AsDecodedUnicode() is deprecated; "
3331 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3332 return NULL;
3333
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003334 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003335 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003336
3337 /* Decode via the codec registry */
3338 v = PyCodec_Decode(unicode, encoding, errors);
3339 if (v == NULL)
3340 goto onError;
3341 if (!PyUnicode_Check(v)) {
3342 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003343 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003344 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003345 encoding,
3346 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003347 Py_DECREF(v);
3348 goto onError;
3349 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003350 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003351
Benjamin Peterson29060642009-01-31 22:14:21 +00003352 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003353 return NULL;
3354}
3355
Alexander Belopolsky40018472011-02-26 01:02:56 +00003356PyObject *
3357PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003358 Py_ssize_t size,
3359 const char *encoding,
3360 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361{
3362 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003363
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003364 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3368 Py_DECREF(unicode);
3369 return v;
3370}
3371
Alexander Belopolsky40018472011-02-26 01:02:56 +00003372PyObject *
3373PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003374 const char *encoding,
3375 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003376{
3377 PyObject *v;
3378
3379 if (!PyUnicode_Check(unicode)) {
3380 PyErr_BadArgument();
3381 goto onError;
3382 }
3383
Serhiy Storchaka00939072016-10-27 21:05:49 +03003384 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3385 "PyUnicode_AsEncodedObject() is deprecated; "
3386 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3387 "or PyCodec_Encode() for generic encoding", 1) < 0)
3388 return NULL;
3389
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003390 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392
3393 /* Encode via the codec registry */
3394 v = PyCodec_Encode(unicode, encoding, errors);
3395 if (v == NULL)
3396 goto onError;
3397 return v;
3398
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003400 return NULL;
3401}
3402
Victor Stinner1b579672011-12-17 05:47:23 +01003403
Victor Stinner2cba6b82018-01-10 22:46:15 +01003404static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003405unicode_encode_locale(PyObject *unicode, const char *errors,
3406 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003407{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003408 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003409
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003410 Py_ssize_t wlen;
3411 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3412 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003413 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003414 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003415
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003416 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003417 if (wlen2 != wlen) {
3418 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003419 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003420 return NULL;
3421 }
3422
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003423 char *str;
3424 size_t error_pos;
3425 const char *reason;
3426 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003427 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003428 if (res != 0) {
3429 if (res == -2) {
3430 PyObject *exc;
3431 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3432 "locale", unicode,
3433 (Py_ssize_t)error_pos,
3434 (Py_ssize_t)(error_pos+1),
3435 reason);
3436 if (exc != NULL) {
3437 PyCodec_StrictErrors(exc);
3438 Py_DECREF(exc);
3439 }
3440 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003441 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003442 else if (res == -3) {
3443 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3444 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003445 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003446 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 PyMem_Free(wstr);
3448 return NULL;
3449 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003450 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003451 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003452
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003453 PyObject *bytes = PyBytes_FromString(str);
3454 PyMem_RawFree(str);
3455 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003456}
3457
Victor Stinnerad158722010-10-27 00:25:46 +00003458PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003459PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3460{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003461 return unicode_encode_locale(unicode, errors, 1);
3462}
3463
3464PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003465PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003466{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003467 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003468 const _PyCoreConfig *config = &interp->core_config;
3469#if defined(__APPLE__)
3470 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3471#else
Victor Stinner793b5312011-04-27 00:24:21 +02003472 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3473 cannot use it to encode and decode filenames before it is loaded. Load
3474 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003475 implementation of the locale codec until the codec registry is
3476 initialized and the Python codec is loaded. See initfsencoding(). */
3477 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003478 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003479 config->filesystem_encoding,
3480 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003481 }
3482 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003483 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003484 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003485 }
Victor Stinnerad158722010-10-27 00:25:46 +00003486#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003487}
3488
Alexander Belopolsky40018472011-02-26 01:02:56 +00003489PyObject *
3490PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003491 const char *encoding,
3492 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493{
3494 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003495 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003496
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 if (!PyUnicode_Check(unicode)) {
3498 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 }
Fred Drakee4315f52000-05-09 19:53:39 +00003501
Victor Stinner942889a2016-09-05 15:40:10 -07003502 if (encoding == NULL) {
3503 return _PyUnicode_AsUTF8String(unicode, errors);
3504 }
3505
Fred Drakee4315f52000-05-09 19:53:39 +00003506 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003507 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3508 char *lower = buflower;
3509
3510 /* Fast paths */
3511 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3512 lower += 3;
3513 if (*lower == '_') {
3514 /* Match "utf8" and "utf_8" */
3515 lower++;
3516 }
3517
3518 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003520 }
3521 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3522 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3523 }
3524 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3525 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3526 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003527 }
Victor Stinner942889a2016-09-05 15:40:10 -07003528 else {
3529 if (strcmp(lower, "ascii") == 0
3530 || strcmp(lower, "us_ascii") == 0) {
3531 return _PyUnicode_AsASCIIString(unicode, errors);
3532 }
Steve Dowercc16be82016-09-08 10:35:16 -07003533#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003534 else if (strcmp(lower, "mbcs") == 0) {
3535 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3536 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003537#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003538 else if (strcmp(lower, "latin1") == 0 ||
3539 strcmp(lower, "latin_1") == 0 ||
3540 strcmp(lower, "iso_8859_1") == 0 ||
3541 strcmp(lower, "iso8859_1") == 0) {
3542 return _PyUnicode_AsLatin1String(unicode, errors);
3543 }
3544 }
Victor Stinner37296e82010-06-10 13:36:23 +00003545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546
3547 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003548 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003550 return NULL;
3551
3552 /* The normal path */
3553 if (PyBytes_Check(v))
3554 return v;
3555
3556 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003557 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003558 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003559 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003560
3561 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003562 "encoder %s returned bytearray instead of bytes; "
3563 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003564 encoding);
3565 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003566 Py_DECREF(v);
3567 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003568 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003569
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003570 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3571 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003572 Py_DECREF(v);
3573 return b;
3574 }
3575
3576 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003577 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003578 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003579 encoding,
3580 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003581 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003582 return NULL;
3583}
3584
Alexander Belopolsky40018472011-02-26 01:02:56 +00003585PyObject *
3586PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003587 const char *encoding,
3588 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003589{
3590 PyObject *v;
3591
3592 if (!PyUnicode_Check(unicode)) {
3593 PyErr_BadArgument();
3594 goto onError;
3595 }
3596
Serhiy Storchaka00939072016-10-27 21:05:49 +03003597 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3598 "PyUnicode_AsEncodedUnicode() is deprecated; "
3599 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3600 return NULL;
3601
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003602 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003603 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003604
3605 /* Encode via the codec registry */
3606 v = PyCodec_Encode(unicode, encoding, errors);
3607 if (v == NULL)
3608 goto onError;
3609 if (!PyUnicode_Check(v)) {
3610 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003611 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003612 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003613 encoding,
3614 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615 Py_DECREF(v);
3616 goto onError;
3617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003619
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 return NULL;
3622}
3623
Victor Stinner2cba6b82018-01-10 22:46:15 +01003624static PyObject*
3625unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3626 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003627{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003628 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003629
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003630 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3631 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003632 return NULL;
3633 }
3634
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003635 wchar_t *wstr;
3636 size_t wlen;
3637 const char *reason;
3638 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003639 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003640 if (res != 0) {
3641 if (res == -2) {
3642 PyObject *exc;
3643 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3644 "locale", str, len,
3645 (Py_ssize_t)wlen,
3646 (Py_ssize_t)(wlen + 1),
3647 reason);
3648 if (exc != NULL) {
3649 PyCodec_StrictErrors(exc);
3650 Py_DECREF(exc);
3651 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003652 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003653 else if (res == -3) {
3654 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3655 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003656 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003657 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003658 }
Victor Stinner2f197072011-12-17 07:08:30 +01003659 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003660 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003661
3662 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3663 PyMem_RawFree(wstr);
3664 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003665}
3666
3667PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003668PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3669 const char *errors)
3670{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003671 return unicode_decode_locale(str, len, errors, 1);
3672}
3673
3674PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003675PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003676{
3677 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003678 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679}
3680
3681
3682PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003683PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003684 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003685 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3686}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003687
Christian Heimes5894ba72007-11-04 11:43:14 +00003688PyObject*
3689PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3690{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003691 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003692 const _PyCoreConfig *config = &interp->core_config;
3693#if defined(__APPLE__)
3694 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3695#else
Victor Stinner793b5312011-04-27 00:24:21 +02003696 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3697 cannot use it to encode and decode filenames before it is loaded. Load
3698 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003699 implementation of the locale codec until the codec registry is
3700 initialized and the Python codec is loaded. See initfsencoding(). */
3701 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003702 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003703 config->filesystem_encoding,
3704 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003705 }
3706 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003707 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003708 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003709 }
Victor Stinnerad158722010-10-27 00:25:46 +00003710#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003711}
3712
Martin v. Löwis011e8422009-05-05 04:43:17 +00003713
3714int
3715PyUnicode_FSConverter(PyObject* arg, void* addr)
3716{
Brett Cannonec6ce872016-09-06 15:50:29 -07003717 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003718 PyObject *output = NULL;
3719 Py_ssize_t size;
3720 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003721 if (arg == NULL) {
3722 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003723 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003724 return 1;
3725 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003726 path = PyOS_FSPath(arg);
3727 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003728 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003729 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003730 if (PyBytes_Check(path)) {
3731 output = path;
3732 }
3733 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3734 output = PyUnicode_EncodeFSDefault(path);
3735 Py_DECREF(path);
3736 if (!output) {
3737 return 0;
3738 }
3739 assert(PyBytes_Check(output));
3740 }
3741
Victor Stinner0ea2a462010-04-30 00:22:08 +00003742 size = PyBytes_GET_SIZE(output);
3743 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003744 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003745 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003746 Py_DECREF(output);
3747 return 0;
3748 }
3749 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003750 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003751}
3752
3753
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003754int
3755PyUnicode_FSDecoder(PyObject* arg, void* addr)
3756{
Brett Cannona5711202016-09-06 19:36:01 -07003757 int is_buffer = 0;
3758 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003759 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003760 if (arg == NULL) {
3761 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003762 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003763 return 1;
3764 }
Brett Cannona5711202016-09-06 19:36:01 -07003765
3766 is_buffer = PyObject_CheckBuffer(arg);
3767 if (!is_buffer) {
3768 path = PyOS_FSPath(arg);
3769 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003770 return 0;
3771 }
Brett Cannona5711202016-09-06 19:36:01 -07003772 }
3773 else {
3774 path = arg;
3775 Py_INCREF(arg);
3776 }
3777
3778 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003779 output = path;
3780 }
3781 else if (PyBytes_Check(path) || is_buffer) {
3782 PyObject *path_bytes = NULL;
3783
3784 if (!PyBytes_Check(path) &&
3785 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003786 "path should be string, bytes, or os.PathLike, not %.200s",
3787 Py_TYPE(arg)->tp_name)) {
3788 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003789 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003790 }
3791 path_bytes = PyBytes_FromObject(path);
3792 Py_DECREF(path);
3793 if (!path_bytes) {
3794 return 0;
3795 }
3796 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3797 PyBytes_GET_SIZE(path_bytes));
3798 Py_DECREF(path_bytes);
3799 if (!output) {
3800 return 0;
3801 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003802 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003803 else {
3804 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003805 "path should be string, bytes, or os.PathLike, not %.200s",
3806 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003807 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003808 return 0;
3809 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003810 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003811 Py_DECREF(output);
3812 return 0;
3813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003815 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003816 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003817 Py_DECREF(output);
3818 return 0;
3819 }
3820 *(PyObject**)addr = output;
3821 return Py_CLEANUP_SUPPORTED;
3822}
3823
3824
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003825const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003827{
Christian Heimesf3863112007-11-22 07:46:41 +00003828 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003830 if (!PyUnicode_Check(unicode)) {
3831 PyErr_BadArgument();
3832 return NULL;
3833 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003835 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003837 if (PyUnicode_UTF8(unicode) == NULL) {
3838 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003839 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 if (bytes == NULL)
3841 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3843 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003844 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 Py_DECREF(bytes);
3846 return NULL;
3847 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003848 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003849 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003850 PyBytes_AS_STRING(bytes),
3851 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852 Py_DECREF(bytes);
3853 }
3854
3855 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003856 *psize = PyUnicode_UTF8_LENGTH(unicode);
3857 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003858}
3859
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003860const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3864}
3865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866Py_UNICODE *
3867PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 if (!PyUnicode_Check(unicode)) {
3870 PyErr_BadArgument();
3871 return NULL;
3872 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003873 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3874 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003876 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003877 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003878
Serhiy Storchakac46db922018-10-23 22:58:24 +03003879 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3880 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3881 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003884 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3885 if (w == NULL) {
3886 PyErr_NoMemory();
3887 return NULL;
3888 }
3889 unicode_copy_as_widechar(unicode, w, wlen + 1);
3890 _PyUnicode_WSTR(unicode) = w;
3891 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3892 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 }
3894 }
3895 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003897 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00003898}
3899
Alexander Belopolsky40018472011-02-26 01:02:56 +00003900Py_UNICODE *
3901PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904}
3905
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003906const Py_UNICODE *
3907_PyUnicode_AsUnicode(PyObject *unicode)
3908{
3909 Py_ssize_t size;
3910 const Py_UNICODE *wstr;
3911
3912 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3913 if (wstr && wcslen(wstr) != (size_t)size) {
3914 PyErr_SetString(PyExc_ValueError, "embedded null character");
3915 return NULL;
3916 }
3917 return wstr;
3918}
3919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920
Alexander Belopolsky40018472011-02-26 01:02:56 +00003921Py_ssize_t
3922PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923{
3924 if (!PyUnicode_Check(unicode)) {
3925 PyErr_BadArgument();
3926 goto onError;
3927 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003928 if (_PyUnicode_WSTR(unicode) == NULL) {
3929 if (PyUnicode_AsUnicode(unicode) == NULL)
3930 goto onError;
3931 }
3932 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935 return -1;
3936}
3937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938Py_ssize_t
3939PyUnicode_GetLength(PyObject *unicode)
3940{
Victor Stinner07621332012-06-16 04:53:46 +02003941 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 PyErr_BadArgument();
3943 return -1;
3944 }
Victor Stinner07621332012-06-16 04:53:46 +02003945 if (PyUnicode_READY(unicode) == -1)
3946 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 return PyUnicode_GET_LENGTH(unicode);
3948}
3949
3950Py_UCS4
3951PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3952{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003953 void *data;
3954 int kind;
3955
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003956 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003957 PyErr_BadArgument();
3958 return (Py_UCS4)-1;
3959 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003960 if (PyUnicode_READY(unicode) == -1) {
3961 return (Py_UCS4)-1;
3962 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003963 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003964 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965 return (Py_UCS4)-1;
3966 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003967 data = PyUnicode_DATA(unicode);
3968 kind = PyUnicode_KIND(unicode);
3969 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970}
3971
3972int
3973PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3974{
3975 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003976 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977 return -1;
3978 }
Victor Stinner488fa492011-12-12 00:01:39 +01003979 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003980 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003981 PyErr_SetString(PyExc_IndexError, "string index out of range");
3982 return -1;
3983 }
Victor Stinner488fa492011-12-12 00:01:39 +01003984 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003985 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003986 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3987 PyErr_SetString(PyExc_ValueError, "character out of range");
3988 return -1;
3989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3991 index, ch);
3992 return 0;
3993}
3994
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4000
Victor Stinner554f3f02010-06-16 23:33:54 +00004001/* create or adjust a UnicodeDecodeError */
4002static void
4003make_decode_exception(PyObject **exceptionObject,
4004 const char *encoding,
4005 const char *input, Py_ssize_t length,
4006 Py_ssize_t startpos, Py_ssize_t endpos,
4007 const char *reason)
4008{
4009 if (*exceptionObject == NULL) {
4010 *exceptionObject = PyUnicodeDecodeError_Create(
4011 encoding, input, length, startpos, endpos, reason);
4012 }
4013 else {
4014 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4015 goto onError;
4016 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4017 goto onError;
4018 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4019 goto onError;
4020 }
4021 return;
4022
4023onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004024 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004025}
4026
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders that write into a wchar_t
   based output string:
   build arguments, call the callback and check the result,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors / errorHandler: name of the error handler and a cache slot for the
       looked-up callable (filled in on first use).
   encoding / reason: used to build or update the UnicodeDecodeError.
   input / inend / inptr: input buffer bounds and read position; they are
       re-pointed at the exception's `object` because the callback may have
       replaced it.
   startinpos / endinpos: error range; *endinpos is set to the position
       returned by the callback.
   exceptionObject: cache slot for the exception instance.
   output / outpos: output string (resized if needed) and write position.

   return 0 on success, -1 on error (with an exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the error
       message, which is why &argparse[3] is used as a plain message below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is both safe and sufficient. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  wcsncpy() must not be used
       here: it stops at the first NUL in the replacement and zero-fills the
       remainder, which would drop everything after an embedded NUL. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004139
4140static int
4141unicode_decode_call_errorhandler_writer(
4142 const char *errors, PyObject **errorHandler,
4143 const char *encoding, const char *reason,
4144 const char **input, const char **inend, Py_ssize_t *startinpos,
4145 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4146 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4147{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004148 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004149
4150 PyObject *restuple = NULL;
4151 PyObject *repunicode = NULL;
4152 Py_ssize_t insize;
4153 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004154 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004155 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004156 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004157 int need_to_grow = 0;
4158 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004159
4160 if (*errorHandler == NULL) {
4161 *errorHandler = PyCodec_LookupError(errors);
4162 if (*errorHandler == NULL)
4163 goto onError;
4164 }
4165
4166 make_decode_exception(exceptionObject,
4167 encoding,
4168 *input, *inend - *input,
4169 *startinpos, *endinpos,
4170 reason);
4171 if (*exceptionObject == NULL)
4172 goto onError;
4173
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004174 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004175 if (restuple == NULL)
4176 goto onError;
4177 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004178 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004179 goto onError;
4180 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004181 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004182 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004183
4184 /* Copy back the bytes variables, which might have been modified by the
4185 callback */
4186 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4187 if (!inputobj)
4188 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004189 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004190 *input = PyBytes_AS_STRING(inputobj);
4191 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004192 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004193 /* we can DECREF safely, as the exception has another reference,
4194 so the object won't go away. */
4195 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004199 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004200 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004202 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203
Victor Stinner170ca6f2013-04-18 00:25:28 +02004204 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004205 if (replen > 1) {
4206 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004207 need_to_grow = 1;
4208 }
4209 new_inptr = *input + newpos;
4210 if (*inend - new_inptr > remain) {
4211 /* We don't know the decoding algorithm here so we make the worst
4212 assumption that one byte decodes to one unicode character.
4213 If unfortunately one byte could decode to more unicode characters,
4214 the decoder may write out-of-bound then. Is it possible for the
4215 algorithms using this function? */
4216 writer->min_length += *inend - new_inptr - remain;
4217 need_to_grow = 1;
4218 }
4219 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004220 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004221 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004222 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4223 goto onError;
4224 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004225 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004226 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004229 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004230
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004232 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234
Benjamin Peterson29060642009-01-31 22:14:21 +00004235 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238}
4239
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */
4245
/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')?
   Note: the argument is evaluated more than once, so it must not have
   side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4253
/* Given that c is a base-64 character, what is its base-64 value (0..63)?
   Any c outside A-Z, a-z, 0-9 and '+' falls through to 63, so callers must
   check IS_BASE64(c) first.  The argument is evaluated more than once. */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)
4261
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off, so n need not be reduced first. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4266
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 never decode directly.  The argument is
 * evaluated twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4274
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4308
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        character code; only 1..127 can ever be encoded directly
 * directO  non-zero if "Set O" characters (category 1) go out as-is
 * directWS non-zero if whitespace (category 2) goes out as-is
 *
 * All three arguments are parenthesized in the expansion, so arbitrary
 * expressions may be passed; c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004320
Alexander Belopolsky40018472011-02-26 01:02:56 +00004321PyObject *
4322PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004323 Py_ssize_t size,
4324 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004326 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4327}
4328
Antoine Pitrou244651a2009-05-04 18:56:13 +00004329/* The decoder. The only state we preserve is our read position,
4330 * i.e. how many characters we have consumed. So if we end in the
4331 * middle of a shift sequence we have to back off the read position
4332 * and the output to the beginning of the sequence, otherwise we lose
4333 * all the shift state (seen bits, number of bits seen, high
4334 * surrogate). */
4335
Alexander Belopolsky40018472011-02-26 01:02:56 +00004336PyObject *
4337PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004338 Py_ssize_t size,
4339 const char *errors,
4340 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004341{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004343 Py_ssize_t startinpos;
4344 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347 const char *errmsg = "";
4348 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004349 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 unsigned int base64bits = 0;
4351 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004352 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 PyObject *errorHandler = NULL;
4354 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004356 if (size == 0) {
4357 if (consumed)
4358 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004359 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004360 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004362 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004363 _PyUnicodeWriter_Init(&writer);
4364 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004365
4366 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004367 e = s + size;
4368
4369 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004370 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004372 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (inShift) { /* in a base-64 section */
4375 if (IS_BASE64(ch)) { /* consume a base-64 character */
4376 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4377 base64bits += 6;
4378 s++;
4379 if (base64bits >= 16) {
4380 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004381 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 base64bits -= 16;
4383 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004384 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 if (surrogate) {
4386 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004387 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4388 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004389 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004390 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004392 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 }
4394 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004395 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004396 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 }
4399 }
Victor Stinner551ac952011-11-29 22:58:13 +01004400 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401 /* first surrogate */
4402 surrogate = outCh;
4403 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004405 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004406 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407 }
4408 }
4409 }
4410 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 if (base64bits > 0) { /* left-over bits */
4413 if (base64bits >= 6) {
4414 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004415 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 errmsg = "partial character in shift sequence";
4417 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004418 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 else {
4420 /* Some bits remain; they should be zero */
4421 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004422 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 errmsg = "non-zero padding bits in shift sequence";
4424 goto utf7Error;
4425 }
4426 }
4427 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004428 if (surrogate && DECODE_DIRECT(ch)) {
4429 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4430 goto onError;
4431 }
4432 surrogate = 0;
4433 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004434 /* '-' is absorbed; other terminating
4435 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004436 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
4439 }
4440 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442 s++; /* consume '+' */
4443 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004445 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004446 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004448 else if (s < e && !IS_BASE64(*s)) {
4449 s++;
4450 errmsg = "ill-formed sequence";
4451 goto utf7Error;
4452 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004455 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004456 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004458 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 }
4460 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004463 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004464 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 else {
4467 startinpos = s-starts;
4468 s++;
4469 errmsg = "unexpected special character";
4470 goto utf7Error;
4471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004475 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 errors, &errorHandler,
4477 "utf7", errmsg,
4478 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004479 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481 }
4482
Antoine Pitrou244651a2009-05-04 18:56:13 +00004483 /* end of string */
4484
4485 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4486 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004487 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 if (surrogate ||
4489 (base64bits >= 6) ||
4490 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004492 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 errors, &errorHandler,
4494 "utf7", "unterminated shift sequence",
4495 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004496 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 goto onError;
4498 if (s < e)
4499 goto restart;
4500 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502
4503 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004504 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004506 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004507 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004508 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004509 writer.kind, writer.data, shiftOutStart);
4510 Py_XDECREF(errorHandler);
4511 Py_XDECREF(exc);
4512 _PyUnicodeWriter_Dealloc(&writer);
4513 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004514 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004515 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 }
4517 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004518 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004520 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 Py_XDECREF(errorHandler);
4523 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004524 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004525
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 Py_XDECREF(errorHandler);
4528 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530 return NULL;
4531}
4532
4533
Alexander Belopolsky40018472011-02-26 01:02:56 +00004534PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004535_PyUnicode_EncodeUTF7(PyObject *str,
4536 int base64SetO,
4537 int base64WhiteSpace,
4538 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004539{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004540 int kind;
4541 void *data;
4542 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004543 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004545 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 unsigned int base64bits = 0;
4547 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 char * out;
4549 char * start;
4550
Benjamin Petersonbac79492012-01-14 13:34:47 -05004551 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004552 return NULL;
4553 kind = PyUnicode_KIND(str);
4554 data = PyUnicode_DATA(str);
4555 len = PyUnicode_GET_LENGTH(str);
4556
4557 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004560 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004561 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004562 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004563 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004564 if (v == NULL)
4565 return NULL;
4566
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004567 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004568 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004569 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004570
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 if (inShift) {
4572 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4573 /* shifting out */
4574 if (base64bits) { /* output remaining bits */
4575 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4576 base64buffer = 0;
4577 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 }
4579 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 /* Characters not in the BASE64 set implicitly unshift the sequence
4581 so no '-' is required, except if the character is itself a '-' */
4582 if (IS_BASE64(ch) || ch == '-') {
4583 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 *out++ = (char) ch;
4586 }
4587 else {
4588 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004589 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 else { /* not in a shift sequence */
4592 if (ch == '+') {
4593 *out++ = '+';
4594 *out++ = '-';
4595 }
4596 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4597 *out++ = (char) ch;
4598 }
4599 else {
4600 *out++ = '+';
4601 inShift = 1;
4602 goto encode_char;
4603 }
4604 }
4605 continue;
4606encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004608 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004609
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 /* code first surrogate */
4611 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004612 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 while (base64bits >= 6) {
4614 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4615 base64bits -= 6;
4616 }
4617 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004618 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 base64bits += 16;
4621 base64buffer = (base64buffer << 16) | ch;
4622 while (base64bits >= 6) {
4623 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4624 base64bits -= 6;
4625 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 if (base64bits)
4628 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4629 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004631 if (_PyBytes_Resize(&v, out - start) < 0)
4632 return NULL;
4633 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004635PyObject *
4636PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4637 Py_ssize_t size,
4638 int base64SetO,
4639 int base64WhiteSpace,
4640 const char *errors)
4641{
4642 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004643 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004644 if (tmp == NULL)
4645 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004646 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004647 base64WhiteSpace, errors);
4648 Py_DECREF(tmp);
4649 return result;
4650}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004651
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652#undef IS_BASE64
4653#undef FROM_BASE64
4654#undef TO_BASE64
4655#undef DECODE_DIRECT
4656#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657
/* --- UTF-8 Codec -------------------------------------------------------- */
4659
Alexander Belopolsky40018472011-02-26 01:02:56 +00004660PyObject *
4661PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004662 Py_ssize_t size,
4663 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664{
Walter Dörwald69652032004-09-07 20:24:22 +00004665 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4666}
4667
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004668#include "stringlib/asciilib.h"
4669#include "stringlib/codecs.h"
4670#include "stringlib/undef.h"
4671
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004672#include "stringlib/ucs1lib.h"
4673#include "stringlib/codecs.h"
4674#include "stringlib/undef.h"
4675
4676#include "stringlib/ucs2lib.h"
4677#include "stringlib/codecs.h"
4678#include "stringlib/undef.h"
4679
4680#include "stringlib/ucs4lib.h"
4681#include "stringlib/codecs.h"
4682#include "stringlib/undef.h"
4683
Antoine Pitrouab868312009-01-10 15:40:25 +00004684/* Mask to quickly check whether a C 'long' contains a
4685 non-ASCII, UTF8-encoded char. */
4686#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004687# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004688#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004689# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004690#else
4691# error C 'long' size should be either 4 or 8!
4692#endif
4693
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694static Py_ssize_t
4695ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004696{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004697 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004698 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004699
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004700 /*
4701 * Issue #17237: m68k is a bit different from most architectures in
4702 * that objects do not use "natural alignment" - for example, int and
4703 * long are only aligned at 2-byte boundaries. Therefore the assert()
4704 * won't work; also, tests have shown that skipping the "optimised
4705 * version" will even speed up m68k.
4706 */
4707#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004709 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4710 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 /* Fast path, see in STRINGLIB(utf8_decode) for
4712 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004713 /* Help allocation */
4714 const char *_p = p;
4715 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 while (_p < aligned_end) {
4717 unsigned long value = *(const unsigned long *) _p;
4718 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 *((unsigned long *)q) = value;
4721 _p += SIZEOF_LONG;
4722 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004723 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 p = _p;
4725 while (p < end) {
4726 if ((unsigned char)*p & 0x80)
4727 break;
4728 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004733#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 while (p < end) {
4735 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4736 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004737 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004738 /* Help allocation */
4739 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004740 while (_p < aligned_end) {
4741 unsigned long value = *(unsigned long *) _p;
4742 if (value & ASCII_CHAR_MASK)
4743 break;
4744 _p += SIZEOF_LONG;
4745 }
4746 p = _p;
4747 if (_p == end)
4748 break;
4749 }
4750 if ((unsigned char)*p & 0x80)
4751 break;
4752 ++p;
4753 }
4754 memcpy(dest, start, p - start);
4755 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756}
Antoine Pitrouab868312009-01-10 15:40:25 +00004757
Victor Stinner785938e2011-12-11 20:09:03 +01004758PyObject *
4759PyUnicode_DecodeUTF8Stateful(const char *s,
4760 Py_ssize_t size,
4761 const char *errors,
4762 Py_ssize_t *consumed)
4763{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004764 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004765 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767
4768 Py_ssize_t startinpos;
4769 Py_ssize_t endinpos;
4770 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004771 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004773 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004774
4775 if (size == 0) {
4776 if (consumed)
4777 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004778 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004779 }
4780
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004781 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4782 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004783 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 *consumed = 1;
4785 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004786 }
4787
Victor Stinner8f674cc2013-04-17 23:02:17 +02004788 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004789 writer.min_length = size;
4790 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004791 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004792
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004793 writer.pos = ascii_decode(s, end, writer.data);
4794 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004795 while (s < end) {
4796 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004797 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004798
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004799 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004800 if (PyUnicode_IS_ASCII(writer.buffer))
4801 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004806 } else {
4807 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 }
4810
4811 switch (ch) {
4812 case 0:
4813 if (s == end || consumed)
4814 goto End;
4815 errmsg = "unexpected end of data";
4816 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004817 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 break;
4819 case 1:
4820 errmsg = "invalid start byte";
4821 startinpos = s - starts;
4822 endinpos = startinpos + 1;
4823 break;
4824 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004825 case 3:
4826 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827 errmsg = "invalid continuation byte";
4828 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004829 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004830 break;
4831 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004832 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 goto onError;
4834 continue;
4835 }
4836
Victor Stinner1d65d912015-10-05 13:43:50 +02004837 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004838 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004839
4840 switch (error_handler) {
4841 case _Py_ERROR_IGNORE:
4842 s += (endinpos - startinpos);
4843 break;
4844
4845 case _Py_ERROR_REPLACE:
4846 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4847 goto onError;
4848 s += (endinpos - startinpos);
4849 break;
4850
4851 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004852 {
4853 Py_ssize_t i;
4854
Victor Stinner1d65d912015-10-05 13:43:50 +02004855 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4856 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004857 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004858 ch = (Py_UCS4)(unsigned char)(starts[i]);
4859 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4860 ch + 0xdc00);
4861 writer.pos++;
4862 }
4863 s += (endinpos - startinpos);
4864 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004865 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004866
4867 default:
4868 if (unicode_decode_call_errorhandler_writer(
4869 errors, &error_handler_obj,
4870 "utf-8", errmsg,
4871 &starts, &end, &startinpos, &endinpos, &exc, &s,
4872 &writer))
4873 goto onError;
4874 }
Victor Stinner785938e2011-12-11 20:09:03 +01004875 }
4876
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 if (consumed)
4879 *consumed = s - starts;
4880
Victor Stinner1d65d912015-10-05 13:43:50 +02004881 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004883 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884
4885onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004886 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004888 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004890}
4891
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004892
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004893/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4894 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004895
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004896 On success, write a pointer to a newly allocated wide character string into
4897 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4898 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004899
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004900 On memory allocation failure, return -1.
4901
4902 On decoding error (if surrogateescape is zero), return -2. If wlen is
4903 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4904 is not NULL, write the decoding error message into *reason. */
4905int
4906_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004907 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004908{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004909 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004910 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 wchar_t *unicode;
4912 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004913
Victor Stinner3d4226a2018-08-29 22:21:32 +02004914 int surrogateescape = 0;
4915 int surrogatepass = 0;
4916 switch (errors)
4917 {
4918 case _Py_ERROR_STRICT:
4919 break;
4920 case _Py_ERROR_SURROGATEESCAPE:
4921 surrogateescape = 1;
4922 break;
4923 case _Py_ERROR_SURROGATEPASS:
4924 surrogatepass = 1;
4925 break;
4926 default:
4927 return -3;
4928 }
4929
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004930 /* Note: size will always be longer than the resulting Unicode
4931 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004932 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004933 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004934 }
4935
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004936 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004937 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004938 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004939 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004940
4941 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004942 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004944 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004948#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004950#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 if (ch > 0xFF) {
4952#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004953 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004955 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004956 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4958 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4959#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004960 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02004962 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004964 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004965
4966 if (surrogateescape) {
4967 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4968 }
4969 else {
4970 /* Is it a valid three-byte code? */
4971 if (surrogatepass
4972 && (e - s) >= 3
4973 && (s[0] & 0xf0) == 0xe0
4974 && (s[1] & 0xc0) == 0x80
4975 && (s[2] & 0xc0) == 0x80)
4976 {
4977 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4978 s += 3;
4979 unicode[outpos++] = ch;
4980 }
4981 else {
4982 PyMem_RawFree(unicode );
4983 if (reason != NULL) {
4984 switch (ch) {
4985 case 0:
4986 *reason = "unexpected end of data";
4987 break;
4988 case 1:
4989 *reason = "invalid start byte";
4990 break;
4991 /* 2, 3, 4 */
4992 default:
4993 *reason = "invalid continuation byte";
4994 break;
4995 }
4996 }
4997 if (wlen != NULL) {
4998 *wlen = s - orig_s;
4999 }
5000 return -2;
5001 }
5002 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005006 if (wlen) {
5007 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005008 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005009 *wstr = unicode;
5010 return 0;
5011}
5012
5013wchar_t*
5014_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5015{
5016 wchar_t *wstr;
5017 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5018 if (res != 0) {
5019 return NULL;
5020 }
5021 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005022}
5023
Antoine Pitrouab868312009-01-10 15:40:25 +00005024
Victor Stinnere47e6982017-12-21 15:45:16 +01005025/* UTF-8 encoder using the surrogateescape error handler .
5026
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005027 On success, return 0 and write the newly allocated character string (use
5028 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005029
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005030 On encoding failure, return -2 and write the position of the invalid
5031 surrogate character into *error_pos (if error_pos is set) and the decoding
5032 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005033
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005034 On memory allocation failure, return -1. */
5035int
5036_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005037 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005038{
5039 const Py_ssize_t max_char_size = 4;
5040 Py_ssize_t len = wcslen(text);
5041
5042 assert(len >= 0);
5043
Victor Stinner3d4226a2018-08-29 22:21:32 +02005044 int surrogateescape = 0;
5045 int surrogatepass = 0;
5046 switch (errors)
5047 {
5048 case _Py_ERROR_STRICT:
5049 break;
5050 case _Py_ERROR_SURROGATEESCAPE:
5051 surrogateescape = 1;
5052 break;
5053 case _Py_ERROR_SURROGATEPASS:
5054 surrogatepass = 1;
5055 break;
5056 default:
5057 return -3;
5058 }
5059
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005060 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5061 return -1;
5062 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005063 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005064 if (raw_malloc) {
5065 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005066 }
5067 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005068 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005069 }
5070 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005071 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005072 }
5073
5074 char *p = bytes;
5075 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005076 for (i = 0; i < len; ) {
5077 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005078 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005079 i++;
5080#if Py_UNICODE_SIZE == 2
5081 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5082 && i < len
5083 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5084 {
5085 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5086 i++;
5087 }
5088#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005089
5090 if (ch < 0x80) {
5091 /* Encode ASCII */
5092 *p++ = (char) ch;
5093
5094 }
5095 else if (ch < 0x0800) {
5096 /* Encode Latin-1 */
5097 *p++ = (char)(0xc0 | (ch >> 6));
5098 *p++ = (char)(0x80 | (ch & 0x3f));
5099 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005100 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005101 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005102 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005103 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005104 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005105 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005106 if (reason != NULL) {
5107 *reason = "encoding error";
5108 }
5109 if (raw_malloc) {
5110 PyMem_RawFree(bytes);
5111 }
5112 else {
5113 PyMem_Free(bytes);
5114 }
5115 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005116 }
5117 *p++ = (char)(ch & 0xff);
5118 }
5119 else if (ch < 0x10000) {
5120 *p++ = (char)(0xe0 | (ch >> 12));
5121 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5122 *p++ = (char)(0x80 | (ch & 0x3f));
5123 }
5124 else { /* ch >= 0x10000 */
5125 assert(ch <= MAX_UNICODE);
5126 /* Encode UCS4 Unicode ordinals */
5127 *p++ = (char)(0xf0 | (ch >> 18));
5128 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5129 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5130 *p++ = (char)(0x80 | (ch & 0x3f));
5131 }
5132 }
5133 *p++ = '\0';
5134
5135 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005136 char *bytes2;
5137 if (raw_malloc) {
5138 bytes2 = PyMem_RawRealloc(bytes, final_size);
5139 }
5140 else {
5141 bytes2 = PyMem_Realloc(bytes, final_size);
5142 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005143 if (bytes2 == NULL) {
5144 if (error_pos != NULL) {
5145 *error_pos = (size_t)-1;
5146 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005147 if (raw_malloc) {
5148 PyMem_RawFree(bytes);
5149 }
5150 else {
5151 PyMem_Free(bytes);
5152 }
5153 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005154 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005155 *str = bytes2;
5156 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005157}
5158
5159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005160/* Primary internal function which creates utf8 encoded bytes objects.
5161
5162 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005163 and allocate exactly as much space needed at the end. Else allocate the
5164 maximum possible needed (4 result bytes per Unicode character), and return
5165 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005166*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005167PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005168_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Victor Stinner6099a032011-12-18 14:22:26 +01005170 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005171 void *data;
5172 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005174 if (!PyUnicode_Check(unicode)) {
5175 PyErr_BadArgument();
5176 return NULL;
5177 }
5178
5179 if (PyUnicode_READY(unicode) == -1)
5180 return NULL;
5181
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005182 if (PyUnicode_UTF8(unicode))
5183 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5184 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005185
5186 kind = PyUnicode_KIND(unicode);
5187 data = PyUnicode_DATA(unicode);
5188 size = PyUnicode_GET_LENGTH(unicode);
5189
Benjamin Petersonead6b532011-12-20 17:23:42 -06005190 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005191 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005192 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005193 case PyUnicode_1BYTE_KIND:
5194 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5195 assert(!PyUnicode_IS_ASCII(unicode));
5196 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5197 case PyUnicode_2BYTE_KIND:
5198 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5199 case PyUnicode_4BYTE_KIND:
5200 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202}
5203
Alexander Belopolsky40018472011-02-26 01:02:56 +00005204PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005205PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5206 Py_ssize_t size,
5207 const char *errors)
5208{
5209 PyObject *v, *unicode;
5210
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005211 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005212 if (unicode == NULL)
5213 return NULL;
5214 v = _PyUnicode_AsUTF8String(unicode, errors);
5215 Py_DECREF(unicode);
5216 return v;
5217}
5218
5219PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005220PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005222 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223}
5224
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225/* --- UTF-32 Codec ------------------------------------------------------- */
5226
5227PyObject *
5228PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 Py_ssize_t size,
5230 const char *errors,
5231 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232{
5233 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5234}
5235
5236PyObject *
5237PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 Py_ssize_t size,
5239 const char *errors,
5240 int *byteorder,
5241 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242{
5243 const char *starts = s;
5244 Py_ssize_t startinpos;
5245 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005246 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005247 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005249 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005250 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251 PyObject *errorHandler = NULL;
5252 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005253
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254 q = (unsigned char *)s;
5255 e = q + size;
5256
5257 if (byteorder)
5258 bo = *byteorder;
5259
5260 /* Check for BOM marks (U+FEFF) in the input and adjust current
5261 byte order setting accordingly. In native mode, the leading BOM
5262 mark is skipped, in all other modes, it is copied to the output
5263 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005265 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005266 if (bom == 0x0000FEFF) {
5267 bo = -1;
5268 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005270 else if (bom == 0xFFFE0000) {
5271 bo = 1;
5272 q += 4;
5273 }
5274 if (byteorder)
5275 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276 }
5277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278 if (q == e) {
5279 if (consumed)
5280 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005281 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282 }
5283
Victor Stinnere64322e2012-10-30 23:12:47 +01005284#ifdef WORDS_BIGENDIAN
5285 le = bo < 0;
5286#else
5287 le = bo <= 0;
5288#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005289 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005290
Victor Stinner8f674cc2013-04-17 23:02:17 +02005291 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005292 writer.min_length = (e - q + 3) / 4;
5293 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005294 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005295
Victor Stinnere64322e2012-10-30 23:12:47 +01005296 while (1) {
5297 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005298 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005299
Victor Stinnere64322e2012-10-30 23:12:47 +01005300 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005301 enum PyUnicode_Kind kind = writer.kind;
5302 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005305 if (le) {
5306 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005307 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 if (ch > maxch)
5309 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005310 if (kind != PyUnicode_1BYTE_KIND &&
5311 Py_UNICODE_IS_SURROGATE(ch))
5312 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005313 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 q += 4;
5315 } while (q <= last);
5316 }
5317 else {
5318 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005319 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 if (ch > maxch)
5321 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005322 if (kind != PyUnicode_1BYTE_KIND &&
5323 Py_UNICODE_IS_SURROGATE(ch))
5324 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005325 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005326 q += 4;
5327 } while (q <= last);
5328 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005329 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005330 }
5331
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005332 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005333 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005334 startinpos = ((const char *)q) - starts;
5335 endinpos = startinpos + 4;
5336 }
5337 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005338 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005342 startinpos = ((const char *)q) - starts;
5343 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005345 else {
5346 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005347 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005348 goto onError;
5349 q += 4;
5350 continue;
5351 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005352 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005353 startinpos = ((const char *)q) - starts;
5354 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005356
5357 /* The remaining input chars are ignored if the callback
5358 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005359 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005361 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005363 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 }
5366
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005369
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 Py_XDECREF(errorHandler);
5371 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005372 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005373
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005375 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005376 Py_XDECREF(errorHandler);
5377 Py_XDECREF(exc);
5378 return NULL;
5379}
5380
5381PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005382_PyUnicode_EncodeUTF32(PyObject *str,
5383 const char *errors,
5384 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005385{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005386 enum PyUnicode_Kind kind;
5387 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005388 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005389 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005390 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005391#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005392 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005393#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005394 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005396 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 PyObject *errorHandler = NULL;
5399 PyObject *exc = NULL;
5400 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005401
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402 if (!PyUnicode_Check(str)) {
5403 PyErr_BadArgument();
5404 return NULL;
5405 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005406 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005407 return NULL;
5408 kind = PyUnicode_KIND(str);
5409 data = PyUnicode_DATA(str);
5410 len = PyUnicode_GET_LENGTH(str);
5411
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005413 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005415 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416 if (v == NULL)
5417 return NULL;
5418
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005419 /* output buffer is 4-bytes aligned */
5420 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005421 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005422 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005424 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005426
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005427 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005430 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 else
5432 encoding = "utf-32";
5433
5434 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005435 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5436 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005437 }
5438
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005439 pos = 0;
5440 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005442
5443 if (kind == PyUnicode_2BYTE_KIND) {
5444 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5445 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005447 else {
5448 assert(kind == PyUnicode_4BYTE_KIND);
5449 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5450 &out, native_ordering);
5451 }
5452 if (pos == len)
5453 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005454
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005455 rep = unicode_encode_call_errorhandler(
5456 errors, &errorHandler,
5457 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005458 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 if (!rep)
5460 goto error;
5461
5462 if (PyBytes_Check(rep)) {
5463 repsize = PyBytes_GET_SIZE(rep);
5464 if (repsize & 3) {
5465 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005466 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005467 "surrogates not allowed");
5468 goto error;
5469 }
5470 moreunits = repsize / 4;
5471 }
5472 else {
5473 assert(PyUnicode_Check(rep));
5474 if (PyUnicode_READY(rep) < 0)
5475 goto error;
5476 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5477 if (!PyUnicode_IS_ASCII(rep)) {
5478 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005479 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005480 "surrogates not allowed");
5481 goto error;
5482 }
5483 }
5484
5485 /* four bytes are reserved for each surrogate */
5486 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005487 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005488 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 /* integer overflow */
5490 PyErr_NoMemory();
5491 goto error;
5492 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005493 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005495 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 }
5497
5498 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005499 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005500 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005501 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005502 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005503 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5504 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005505 }
5506
5507 Py_CLEAR(rep);
5508 }
5509
5510 /* Cut back to size actually needed. This is necessary for, for example,
5511 encoding of a string containing isolated surrogates and the 'ignore'
5512 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005513 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005514 if (nsize != PyBytes_GET_SIZE(v))
5515 _PyBytes_Resize(&v, nsize);
5516 Py_XDECREF(errorHandler);
5517 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005518 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005519 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005520 error:
5521 Py_XDECREF(rep);
5522 Py_XDECREF(errorHandler);
5523 Py_XDECREF(exc);
5524 Py_XDECREF(v);
5525 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005526}
5527
Alexander Belopolsky40018472011-02-26 01:02:56 +00005528PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005529PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5530 Py_ssize_t size,
5531 const char *errors,
5532 int byteorder)
5533{
5534 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005535 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005536 if (tmp == NULL)
5537 return NULL;
5538 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5539 Py_DECREF(tmp);
5540 return result;
5541}
5542
5543PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005544PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005545{
Victor Stinnerb960b342011-11-20 19:12:52 +01005546 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005547}
5548
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549/* --- UTF-16 Codec ------------------------------------------------------- */
5550
Tim Peters772747b2001-08-09 22:21:55 +00005551PyObject *
5552PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 Py_ssize_t size,
5554 const char *errors,
5555 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556{
Walter Dörwald69652032004-09-07 20:24:22 +00005557 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5558}
5559
5560PyObject *
5561PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 Py_ssize_t size,
5563 const char *errors,
5564 int *byteorder,
5565 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005568 Py_ssize_t startinpos;
5569 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005570 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005571 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005572 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005573 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005574 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005575 PyObject *errorHandler = NULL;
5576 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005577 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Tim Peters772747b2001-08-09 22:21:55 +00005579 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005580 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581
5582 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005583 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005585 /* Check for BOM marks (U+FEFF) in the input and adjust current
5586 byte order setting accordingly. In native mode, the leading BOM
5587 mark is skipped, in all other modes, it is copied to the output
5588 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 if (bo == 0 && size >= 2) {
5590 const Py_UCS4 bom = (q[1] << 8) | q[0];
5591 if (bom == 0xFEFF) {
5592 q += 2;
5593 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005595 else if (bom == 0xFFFE) {
5596 q += 2;
5597 bo = 1;
5598 }
5599 if (byteorder)
5600 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603 if (q == e) {
5604 if (consumed)
5605 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005606 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005607 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608
Christian Heimes743e0cd2012-10-17 23:52:17 +02005609#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005610 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005611 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005612#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005614 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005615#endif
Tim Peters772747b2001-08-09 22:21:55 +00005616
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005618 character count normally. Error handler will take care of
5619 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005620 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005621 writer.min_length = (e - q + 1) / 2;
5622 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 while (1) {
5626 Py_UCS4 ch = 0;
5627 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005630 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005631 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005633 native_ordering);
5634 else
5635 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005636 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005637 native_ordering);
5638 } else if (kind == PyUnicode_2BYTE_KIND) {
5639 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005640 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005641 native_ordering);
5642 } else {
5643 assert(kind == PyUnicode_4BYTE_KIND);
5644 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005645 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005646 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005647 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005648 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649
Antoine Pitrou63065d72012-05-15 23:48:04 +02005650 switch (ch)
5651 {
5652 case 0:
5653 /* remaining byte at the end? (size should be even) */
5654 if (q == e || consumed)
5655 goto End;
5656 errmsg = "truncated data";
5657 startinpos = ((const char *)q) - starts;
5658 endinpos = ((const char *)e) - starts;
5659 break;
5660 /* The remaining input chars are ignored if the callback
5661 chooses to skip the input */
5662 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005663 q -= 2;
5664 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005665 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005666 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005667 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005668 endinpos = ((const char *)e) - starts;
5669 break;
5670 case 2:
5671 errmsg = "illegal encoding";
5672 startinpos = ((const char *)q) - 2 - starts;
5673 endinpos = startinpos + 2;
5674 break;
5675 case 3:
5676 errmsg = "illegal UTF-16 surrogate";
5677 startinpos = ((const char *)q) - 4 - starts;
5678 endinpos = startinpos + 2;
5679 break;
5680 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005681 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005682 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 continue;
5684 }
5685
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005687 errors,
5688 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005689 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005690 &starts,
5691 (const char **)&e,
5692 &startinpos,
5693 &endinpos,
5694 &exc,
5695 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005696 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 }
5699
Antoine Pitrou63065d72012-05-15 23:48:04 +02005700End:
Walter Dörwald69652032004-09-07 20:24:22 +00005701 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 Py_XDECREF(errorHandler);
5705 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005706 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005709 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 Py_XDECREF(errorHandler);
5711 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 return NULL;
5713}
5714
Tim Peters772747b2001-08-09 22:21:55 +00005715PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716_PyUnicode_EncodeUTF16(PyObject *str,
5717 const char *errors,
5718 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 enum PyUnicode_Kind kind;
5721 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005722 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005723 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005724 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005726#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005727 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005728#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005729 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005730#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 const char *encoding;
5732 Py_ssize_t nsize, pos;
5733 PyObject *errorHandler = NULL;
5734 PyObject *exc = NULL;
5735 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005736
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737 if (!PyUnicode_Check(str)) {
5738 PyErr_BadArgument();
5739 return NULL;
5740 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005741 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005742 return NULL;
5743 kind = PyUnicode_KIND(str);
5744 data = PyUnicode_DATA(str);
5745 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005746
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005747 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005748 if (kind == PyUnicode_4BYTE_KIND) {
5749 const Py_UCS4 *in = (const Py_UCS4 *)data;
5750 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 while (in < end) {
5752 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 }
5755 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005756 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 nsize = len + pairs + (byteorder == 0);
5761 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005766 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005767 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005768 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005769 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005770 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005771 }
5772 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 }
Tim Peters772747b2001-08-09 22:21:55 +00005775
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005776 if (kind == PyUnicode_1BYTE_KIND) {
5777 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5778 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005779 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005780
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005781 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005782 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005783 }
5784 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005785 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005786 }
5787 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005788 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005789 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790
5791 pos = 0;
5792 while (pos < len) {
5793 Py_ssize_t repsize, moreunits;
5794
5795 if (kind == PyUnicode_2BYTE_KIND) {
5796 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5797 &out, native_ordering);
5798 }
5799 else {
5800 assert(kind == PyUnicode_4BYTE_KIND);
5801 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5802 &out, native_ordering);
5803 }
5804 if (pos == len)
5805 break;
5806
5807 rep = unicode_encode_call_errorhandler(
5808 errors, &errorHandler,
5809 encoding, "surrogates not allowed",
5810 str, &exc, pos, pos + 1, &pos);
5811 if (!rep)
5812 goto error;
5813
5814 if (PyBytes_Check(rep)) {
5815 repsize = PyBytes_GET_SIZE(rep);
5816 if (repsize & 1) {
5817 raise_encode_exception(&exc, encoding,
5818 str, pos - 1, pos,
5819 "surrogates not allowed");
5820 goto error;
5821 }
5822 moreunits = repsize / 2;
5823 }
5824 else {
5825 assert(PyUnicode_Check(rep));
5826 if (PyUnicode_READY(rep) < 0)
5827 goto error;
5828 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5829 if (!PyUnicode_IS_ASCII(rep)) {
5830 raise_encode_exception(&exc, encoding,
5831 str, pos - 1, pos,
5832 "surrogates not allowed");
5833 goto error;
5834 }
5835 }
5836
5837 /* two bytes are reserved for each surrogate */
5838 if (moreunits > 1) {
5839 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005840 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005841 /* integer overflow */
5842 PyErr_NoMemory();
5843 goto error;
5844 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005845 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 goto error;
5847 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5848 }
5849
5850 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005851 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005852 out += moreunits;
5853 } else /* rep is unicode */ {
5854 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5855 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5856 &out, native_ordering);
5857 }
5858
5859 Py_CLEAR(rep);
5860 }
5861
5862 /* Cut back to size actually needed. This is necessary for, for example,
5863 encoding of a string containing isolated surrogates and the 'ignore' handler
5864 is used. */
5865 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5866 if (nsize != PyBytes_GET_SIZE(v))
5867 _PyBytes_Resize(&v, nsize);
5868 Py_XDECREF(errorHandler);
5869 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005870 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005871 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005872 error:
5873 Py_XDECREF(rep);
5874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
5876 Py_XDECREF(v);
5877 return NULL;
5878#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879}
5880
Alexander Belopolsky40018472011-02-26 01:02:56 +00005881PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5883 Py_ssize_t size,
5884 const char *errors,
5885 int byteorder)
5886{
5887 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005888 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 if (tmp == NULL)
5890 return NULL;
5891 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5892 Py_DECREF(tmp);
5893 return result;
5894}
5895
5896PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005897PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005899 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900}
5901
5902/* --- Unicode Escape Codec ----------------------------------------------- */
5903
/* Cached character-name lookup API used for \N{...} escapes.  It stays
   NULL until _PyUnicode_DecodeUnicodeEscape() first meets such an escape
   and loads it with PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1). */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005905
Alexander Belopolsky40018472011-02-26 01:02:56 +00005906PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005907_PyUnicode_DecodeUnicodeEscape(const char *s,
5908 Py_ssize_t size,
5909 const char *errors,
5910 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005913 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 PyObject *errorHandler = NULL;
5916 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005917
Eric V. Smith42454af2016-10-31 09:22:08 -04005918 // so we can remember if we've seen an invalid escape char or not
5919 *first_invalid_escape = NULL;
5920
Victor Stinner62ec3312016-09-06 17:04:34 -07005921 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005922 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005923 }
5924 /* Escaped strings will always be longer than the resulting
5925 Unicode string, so we start with size here and then reduce the
5926 length after conversion to the true value.
5927 (but if the error callback returns a long replacement string
5928 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005929 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005930 writer.min_length = size;
5931 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5932 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005933 }
5934
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 end = s + size;
5936 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005937 unsigned char c = (unsigned char) *s++;
5938 Py_UCS4 ch;
5939 int count;
5940 Py_ssize_t startinpos;
5941 Py_ssize_t endinpos;
5942 const char *message;
5943
5944#define WRITE_ASCII_CHAR(ch) \
5945 do { \
5946 assert(ch <= 127); \
5947 assert(writer.pos < writer.size); \
5948 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5949 } while(0)
5950
5951#define WRITE_CHAR(ch) \
5952 do { \
5953 if (ch <= writer.maxchar) { \
5954 assert(writer.pos < writer.size); \
5955 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5956 } \
5957 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5958 goto onError; \
5959 } \
5960 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
5962 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 if (c != '\\') {
5964 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 continue;
5966 }
5967
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 if (s >= end) {
5971 message = "\\ at end of string";
5972 goto error;
5973 }
5974 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975
Victor Stinner62ec3312016-09-06 17:04:34 -07005976 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005977 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005980 case '\n': continue;
5981 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5982 case '\'': WRITE_ASCII_CHAR('\''); continue;
5983 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5984 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5987 case 't': WRITE_ASCII_CHAR('\t'); continue;
5988 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5989 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005992 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 case '0': case '1': case '2': case '3':
5997 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005999 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006000 ch = (ch<<3) + *s++ - '0';
6001 if (s < end && '0' <= *s && *s <= '7') {
6002 ch = (ch<<3) + *s++ - '0';
6003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006005 WRITE_CHAR(ch);
6006 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 /* hex escapes */
6009 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006012 message = "truncated \\xXX escape";
6013 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006018 message = "truncated \\uXXXX escape";
6019 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006022 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006023 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006024 message = "truncated \\UXXXXXXXX escape";
6025 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006027 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 ch <<= 4;
6029 if (c >= '0' && c <= '9') {
6030 ch += c - '0';
6031 }
6032 else if (c >= 'a' && c <= 'f') {
6033 ch += c - ('a' - 10);
6034 }
6035 else if (c >= 'A' && c <= 'F') {
6036 ch += c - ('A' - 10);
6037 }
6038 else {
6039 break;
6040 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006041 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006042 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006043 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006044 }
6045
6046 /* when we get here, ch is a 32-bit unicode character */
6047 if (ch > MAX_UNICODE) {
6048 message = "illegal Unicode character";
6049 goto error;
6050 }
6051
6052 WRITE_CHAR(ch);
6053 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006056 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 if (ucnhash_CAPI == NULL) {
6058 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006059 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6060 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006061 if (ucnhash_CAPI == NULL) {
6062 PyErr_SetString(
6063 PyExc_UnicodeError,
6064 "\\N escapes not supported (can't load unicodedata module)"
6065 );
6066 goto onError;
6067 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006069
6070 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 const char *start = ++s;
6073 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 namelen = s - start;
6078 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006079 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006080 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006081 ch = 0xffffffff; /* in case 'getcode' messes up */
6082 if (namelen <= INT_MAX &&
6083 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6084 &ch, 0)) {
6085 assert(ch <= MAX_UNICODE);
6086 WRITE_CHAR(ch);
6087 continue;
6088 }
6089 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006090 }
6091 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006092 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006093
6094 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006095 if (*first_invalid_escape == NULL) {
6096 *first_invalid_escape = s-1; /* Back up one char, since we've
6097 already incremented s. */
6098 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006099 WRITE_ASCII_CHAR('\\');
6100 WRITE_CHAR(c);
6101 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006103
6104 error:
6105 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006106 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006107 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006108 errors, &errorHandler,
6109 "unicodeescape", message,
6110 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006111 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006112 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006113 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006114 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006115
6116#undef WRITE_ASCII_CHAR
6117#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006119
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006122 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129}
6130
Eric V. Smith42454af2016-10-31 09:22:08 -04006131PyObject *
6132PyUnicode_DecodeUnicodeEscape(const char *s,
6133 Py_ssize_t size,
6134 const char *errors)
6135{
6136 const char *first_invalid_escape;
6137 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6138 &first_invalid_escape);
6139 if (result == NULL)
6140 return NULL;
6141 if (first_invalid_escape != NULL) {
6142 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6143 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006144 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006145 Py_DECREF(result);
6146 return NULL;
6147 }
6148 }
6149 return result;
6150}
6151
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006152/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153
Alexander Belopolsky40018472011-02-26 01:02:56 +00006154PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Ezio Melottie7f90372012-10-05 03:33:31 +03006164 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006165 escape.
6166
Ezio Melottie7f90372012-10-05 03:33:31 +03006167 For UCS1 strings it's '\xxx', 4 bytes per source character.
6168 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6169 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006170 */
6171
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 if (!PyUnicode_Check(unicode)) {
6173 PyErr_BadArgument();
6174 return NULL;
6175 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006178 }
Victor Stinner358af132015-10-12 22:36:57 +02006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 if (len == 0) {
6182 return PyBytes_FromStringAndSize(NULL, 0);
6183 }
6184
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 kind = PyUnicode_KIND(unicode);
6186 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6188 bytes, and 1 byte characters 4. */
6189 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006190 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006191 return PyErr_NoMemory();
6192 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006193 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006194 if (repr == NULL) {
6195 return NULL;
6196 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006197
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006199 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006200 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006201
Victor Stinner62ec3312016-09-06 17:04:34 -07006202 /* U+0000-U+00ff range */
6203 if (ch < 0x100) {
6204 if (ch >= ' ' && ch < 127) {
6205 if (ch != '\\') {
6206 /* Copy printable US ASCII as-is */
6207 *p++ = (char) ch;
6208 }
6209 /* Escape backslashes */
6210 else {
6211 *p++ = '\\';
6212 *p++ = '\\';
6213 }
6214 }
Victor Stinner358af132015-10-12 22:36:57 +02006215
Victor Stinner62ec3312016-09-06 17:04:34 -07006216 /* Map special whitespace to '\t', \n', '\r' */
6217 else if (ch == '\t') {
6218 *p++ = '\\';
6219 *p++ = 't';
6220 }
6221 else if (ch == '\n') {
6222 *p++ = '\\';
6223 *p++ = 'n';
6224 }
6225 else if (ch == '\r') {
6226 *p++ = '\\';
6227 *p++ = 'r';
6228 }
6229
6230 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6231 else {
6232 *p++ = '\\';
6233 *p++ = 'x';
6234 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6235 *p++ = Py_hexdigits[ch & 0x000F];
6236 }
Tim Petersced69f82003-09-16 20:30:58 +00006237 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006238 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 *p++ = '\\';
6241 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006242 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6243 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6244 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6245 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6248 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006249
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 /* Make sure that the first two digits are zero */
6251 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006252 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 *p++ = 'U';
6254 *p++ = '0';
6255 *p++ = '0';
6256 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6257 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6259 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6260 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6261 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 assert(p - PyBytes_AS_STRING(repr) > 0);
6266 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6267 return NULL;
6268 }
6269 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270}
6271
Alexander Belopolsky40018472011-02-26 01:02:56 +00006272PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006273PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6274 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006276 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006277 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006278 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 }
6281
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006282 result = PyUnicode_AsUnicodeEscapeString(tmp);
6283 Py_DECREF(tmp);
6284 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285}
6286
6287/* --- Raw Unicode Escape Codec ------------------------------------------- */
6288
Alexander Belopolsky40018472011-02-26 01:02:56 +00006289PyObject *
6290PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006291 Py_ssize_t size,
6292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006295 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006297 PyObject *errorHandler = NULL;
6298 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006299
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006301 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 /* Escaped strings will always be longer than the resulting
6305 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006306 length after conversion to the true value. (But decoding error
6307 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006308 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 writer.min_length = size;
6310 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6311 goto onError;
6312 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006313
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 end = s + size;
6315 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006316 unsigned char c = (unsigned char) *s++;
6317 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006318 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 Py_ssize_t startinpos;
6320 Py_ssize_t endinpos;
6321 const char *message;
6322
6323#define WRITE_CHAR(ch) \
6324 do { \
6325 if (ch <= writer.maxchar) { \
6326 assert(writer.pos < writer.size); \
6327 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6328 } \
6329 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6330 goto onError; \
6331 } \
6332 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 if (c != '\\' || s >= end) {
6336 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006339
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 c = (unsigned char) *s++;
6341 if (c == 'u') {
6342 count = 4;
6343 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 else if (c == 'U') {
6346 count = 8;
6347 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006348 }
6349 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 assert(writer.pos < writer.size);
6351 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6352 WRITE_CHAR(c);
6353 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006354 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 startinpos = s - starts - 2;
6356
6357 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6358 for (ch = 0; count && s < end; ++s, --count) {
6359 c = (unsigned char)*s;
6360 ch <<= 4;
6361 if (c >= '0' && c <= '9') {
6362 ch += c - '0';
6363 }
6364 else if (c >= 'a' && c <= 'f') {
6365 ch += c - ('a' - 10);
6366 }
6367 else if (c >= 'A' && c <= 'F') {
6368 ch += c - ('A' - 10);
6369 }
6370 else {
6371 break;
6372 }
6373 }
6374 if (!count) {
6375 if (ch <= MAX_UNICODE) {
6376 WRITE_CHAR(ch);
6377 continue;
6378 }
6379 message = "\\Uxxxxxxxx out of range";
6380 }
6381
6382 endinpos = s-starts;
6383 writer.min_length = end - s + writer.pos;
6384 if (unicode_decode_call_errorhandler_writer(
6385 errors, &errorHandler,
6386 "rawunicodeescape", message,
6387 &starts, &end, &startinpos, &endinpos, &exc, &s,
6388 &writer)) {
6389 goto onError;
6390 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006391 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006392
6393#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 Py_XDECREF(errorHandler);
6396 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006397 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006398
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006400 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 Py_XDECREF(errorHandler);
6402 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006404
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405}
6406
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407
Alexander Belopolsky40018472011-02-26 01:02:56 +00006408PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006409PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410{
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414 int kind;
6415 void *data;
6416 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418 if (!PyUnicode_Check(unicode)) {
6419 PyErr_BadArgument();
6420 return NULL;
6421 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425 kind = PyUnicode_KIND(unicode);
6426 data = PyUnicode_DATA(unicode);
6427 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 if (kind == PyUnicode_1BYTE_KIND) {
6429 return PyBytes_FromStringAndSize(data, len);
6430 }
Victor Stinner0e368262011-11-10 20:12:49 +01006431
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6433 bytes, and 1 byte characters 4. */
6434 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006435
Victor Stinner62ec3312016-09-06 17:04:34 -07006436 if (len > PY_SSIZE_T_MAX / expandsize) {
6437 return PyErr_NoMemory();
6438 }
6439 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6440 if (repr == NULL) {
6441 return NULL;
6442 }
6443 if (len == 0) {
6444 return repr;
6445 }
6446
6447 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006448 for (pos = 0; pos < len; pos++) {
6449 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006450
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6452 if (ch < 0x100) {
6453 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006454 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006455 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 *p++ = '\\';
6458 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006459 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6462 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6465 else {
6466 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6467 *p++ = '\\';
6468 *p++ = 'U';
6469 *p++ = '0';
6470 *p++ = '0';
6471 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6476 *p++ = Py_hexdigits[ch & 15];
6477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006479
Victor Stinner62ec3312016-09-06 17:04:34 -07006480 assert(p > PyBytes_AS_STRING(repr));
6481 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6482 return NULL;
6483 }
6484 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485}
6486
Alexander Belopolsky40018472011-02-26 01:02:56 +00006487PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006488PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6489 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006492 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006494 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006495 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6496 Py_DECREF(tmp);
6497 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006500/* --- Unicode Internal Codec ------------------------------------------- */
6501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502PyObject *
6503_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006504 Py_ssize_t size,
6505 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006506{
6507 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006508 Py_ssize_t startinpos;
6509 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006510 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006511 const char *end;
6512 const char *reason;
6513 PyObject *errorHandler = NULL;
6514 PyObject *exc = NULL;
6515
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006516 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006517 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006518 1))
6519 return NULL;
6520
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006521 if (size < 0) {
6522 PyErr_BadInternalCall();
6523 return NULL;
6524 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006525 if (size == 0)
6526 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006527
Victor Stinner8f674cc2013-04-17 23:02:17 +02006528 _PyUnicodeWriter_Init(&writer);
6529 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6530 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006532 }
6533 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534
Victor Stinner8f674cc2013-04-17 23:02:17 +02006535 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006536 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006537 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006538 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006539 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006540 endinpos = end-starts;
6541 reason = "truncated input";
6542 goto error;
6543 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006544 /* We copy the raw representation one byte at a time because the
6545 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006546 ((char *) &uch)[0] = s[0];
6547 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006548#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 ((char *) &uch)[2] = s[2];
6550 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006551#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006552 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006553#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006554 /* We have to sanity check the raw data, otherwise doom looms for
6555 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006556 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006557 endinpos = s - starts + Py_UNICODE_SIZE;
6558 reason = "illegal code point (> 0x10FFFF)";
6559 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006560 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006561#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 s += Py_UNICODE_SIZE;
6563#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006564 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006565 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006566 Py_UNICODE uch2;
6567 ((char *) &uch2)[0] = s[0];
6568 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006569 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570 {
Victor Stinner551ac952011-11-29 22:58:13 +01006571 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006573 }
6574 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575#endif
6576
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006577 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006578 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006579 continue;
6580
6581 error:
6582 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006583 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006584 errors, &errorHandler,
6585 "unicode_internal", reason,
6586 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006587 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006588 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006589 }
6590
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006591 Py_XDECREF(errorHandler);
6592 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006593 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006596 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006597 Py_XDECREF(errorHandler);
6598 Py_XDECREF(exc);
6599 return NULL;
6600}
6601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602/* --- Latin-1 Codec ------------------------------------------------------ */
6603
Alexander Belopolsky40018472011-02-26 01:02:56 +00006604PyObject *
6605PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006606 Py_ssize_t size,
6607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006610 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611}
6612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006613/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006614static void
6615make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006616 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006617 PyObject *unicode,
6618 Py_ssize_t startpos, Py_ssize_t endpos,
6619 const char *reason)
6620{
6621 if (*exceptionObject == NULL) {
6622 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006624 encoding, unicode, startpos, endpos, reason);
6625 }
6626 else {
6627 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6628 goto onError;
6629 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6630 goto onError;
6631 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6632 goto onError;
6633 return;
6634 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006635 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006636 }
6637}
6638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static void
6641raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006642 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006643 PyObject *unicode,
6644 Py_ssize_t startpos, Py_ssize_t endpos,
6645 const char *reason)
6646{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006647 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006648 encoding, unicode, startpos, endpos, reason);
6649 if (*exceptionObject != NULL)
6650 PyCodec_StrictErrors(*exceptionObject);
6651}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652
6653/* error handling callback helper:
6654 build arguments, call the callback and check the arguments,
6655 put the result into newpos and return the replacement string, which
6656 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006657static PyObject *
6658unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006659 PyObject **errorHandler,
6660 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006662 Py_ssize_t startpos, Py_ssize_t endpos,
6663 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006665 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 PyObject *restuple;
6668 PyObject *resunicode;
6669
6670 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 }
6675
Benjamin Petersonbac79492012-01-14 13:34:47 -05006676 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 return NULL;
6678 len = PyUnicode_GET_LENGTH(unicode);
6679
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006680 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006681 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006685 restuple = PyObject_CallFunctionObjArgs(
6686 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006690 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 Py_DECREF(restuple);
6692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006694 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 &resunicode, newpos)) {
6696 Py_DECREF(restuple);
6697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006699 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6700 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6701 Py_DECREF(restuple);
6702 return NULL;
6703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006705 *newpos = len + *newpos;
6706 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006707 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 Py_DECREF(restuple);
6709 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 Py_INCREF(resunicode);
6712 Py_DECREF(restuple);
6713 return resunicode;
6714}
6715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006718 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006719 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006720{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006721 /* input state */
6722 Py_ssize_t pos=0, size;
6723 int kind;
6724 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 /* pointer into the output */
6726 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006727 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6728 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006729 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006731 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006732 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006733 /* output object */
6734 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735
Benjamin Petersonbac79492012-01-14 13:34:47 -05006736 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006737 return NULL;
6738 size = PyUnicode_GET_LENGTH(unicode);
6739 kind = PyUnicode_KIND(unicode);
6740 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 /* allocate enough for a simple encoding without
6742 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006743 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006744 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006745
6746 _PyBytesWriter_Init(&writer);
6747 str = _PyBytesWriter_Alloc(&writer, size);
6748 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006749 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006752 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006753
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006755 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006757 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006758 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006759 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006761 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006764 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006766
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006767 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006769
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006770 /* Only overallocate the buffer if it's not the last write */
6771 writer.overallocate = (collend < size);
6772
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006774 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006775 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006776
6777 switch (error_handler) {
6778 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006779 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006781
6782 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006783 memset(str, '?', collend - collstart);
6784 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006785 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006786 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006787 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 break;
Victor Stinner50149202015-09-22 00:26:54 +02006789
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006790 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006791 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006792 writer.min_size -= (collend - collstart);
6793 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006794 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006795 if (str == NULL)
6796 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006797 pos = collend;
6798 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006799
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006800 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006801 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006802 writer.min_size -= (collend - collstart);
6803 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006804 unicode, collstart, collend);
6805 if (str == NULL)
6806 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006807 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 break;
Victor Stinner50149202015-09-22 00:26:54 +02006809
Victor Stinnerc3713e92015-09-29 12:32:13 +02006810 case _Py_ERROR_SURROGATEESCAPE:
6811 for (i = collstart; i < collend; ++i) {
6812 ch = PyUnicode_READ(kind, data, i);
6813 if (ch < 0xdc80 || 0xdcff < ch) {
6814 /* Not a UTF-8b surrogate */
6815 break;
6816 }
6817 *str++ = (char)(ch - 0xdc00);
6818 ++pos;
6819 }
6820 if (i >= collend)
6821 break;
6822 collstart = pos;
6823 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006824 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006825
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006827 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6828 encoding, reason, unicode, &exc,
6829 collstart, collend, &newpos);
6830 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006832
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006833 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006834 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006835
Victor Stinner6bd525b2015-10-09 13:10:05 +02006836 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006837 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006838 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006839 PyBytes_AS_STRING(rep),
6840 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006841 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842 else {
6843 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006844
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006847
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006848 if (limit == 256 ?
6849 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6850 !PyUnicode_IS_ASCII(rep))
6851 {
6852 /* Not all characters are smaller than limit */
6853 raise_encode_exception(&exc, encoding, unicode,
6854 collstart, collend, reason);
6855 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006857 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6858 str = _PyBytesWriter_WriteBytes(&writer, str,
6859 PyUnicode_DATA(rep),
6860 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006862 if (str == NULL)
6863 goto onError;
6864
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006865 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006866 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006868
6869 /* If overallocation was disabled, ensure that it was the last
6870 write. Otherwise, we missed an optimization */
6871 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006872 }
6873 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874
Victor Stinner50149202015-09-22 00:26:54 +02006875 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006877 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006878
6879 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006880 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006881 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006882 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006883 Py_XDECREF(exc);
6884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885}
6886
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006888PyObject *
6889PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006890 Py_ssize_t size,
6891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006894 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006895 if (unicode == NULL)
6896 return NULL;
6897 result = unicode_encode_ucs1(unicode, errors, 256);
6898 Py_DECREF(unicode);
6899 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900}
6901
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006903_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904{
6905 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 PyErr_BadArgument();
6907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006909 if (PyUnicode_READY(unicode) == -1)
6910 return NULL;
6911 /* Fast path: if it is a one-byte string, construct
6912 bytes object directly. */
6913 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6914 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6915 PyUnicode_GET_LENGTH(unicode));
6916 /* Non-Latin-1 characters present. Defer to above function to
6917 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006918 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006919}
6920
6921PyObject*
6922PyUnicode_AsLatin1String(PyObject *unicode)
6923{
6924 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
6927/* --- 7-bit ASCII Codec -------------------------------------------------- */
6928
Alexander Belopolsky40018472011-02-26 01:02:56 +00006929PyObject *
6930PyUnicode_DecodeASCII(const char *s,
6931 Py_ssize_t size,
6932 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006935 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006936 int kind;
6937 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006938 Py_ssize_t startinpos;
6939 Py_ssize_t endinpos;
6940 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006942 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006944 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006947 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006948
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006950 if (size == 1 && (unsigned char)s[0] < 128)
6951 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006952
Victor Stinner8f674cc2013-04-17 23:02:17 +02006953 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006954 writer.min_length = size;
6955 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006956 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006960 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006961 writer.pos = outpos;
6962 if (writer.pos == size)
6963 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006964
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 s += writer.pos;
6966 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006967 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006968 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 PyUnicode_WRITE(kind, data, writer.pos, c);
6971 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006975
6976 /* byte outsize range 0x00..0x7f: call the error handler */
6977
6978 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006979 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006980
6981 switch (error_handler)
6982 {
6983 case _Py_ERROR_REPLACE:
6984 case _Py_ERROR_SURROGATEESCAPE:
6985 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006986 but we may switch to UCS2 at the first write */
6987 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6988 goto onError;
6989 kind = writer.kind;
6990 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006991
6992 if (error_handler == _Py_ERROR_REPLACE)
6993 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6994 else
6995 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6996 writer.pos++;
6997 ++s;
6998 break;
6999
7000 case _Py_ERROR_IGNORE:
7001 ++s;
7002 break;
7003
7004 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 startinpos = s-starts;
7006 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007008 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 "ascii", "ordinal not in range(128)",
7010 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007011 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 kind = writer.kind;
7014 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007017 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007018 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007019 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007022 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 return NULL;
7026}
7027
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007029PyObject *
7030PyUnicode_EncodeASCII(const Py_UNICODE *p,
7031 Py_ssize_t size,
7032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007035 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007036 if (unicode == NULL)
7037 return NULL;
7038 result = unicode_encode_ucs1(unicode, errors, 128);
7039 Py_DECREF(unicode);
7040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
Alexander Belopolsky40018472011-02-26 01:02:56 +00007043PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045{
7046 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyErr_BadArgument();
7048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050 if (PyUnicode_READY(unicode) == -1)
7051 return NULL;
7052 /* Fast path: if it is an ASCII-only string, construct bytes object
7053 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007054 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7056 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007058}
7059
7060PyObject *
7061PyUnicode_AsASCIIString(PyObject *unicode)
7062{
7063 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Steve Dowercc16be82016-09-08 10:35:16 -07007066#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007067
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007068/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007069
/* When int is narrower than size_t, a Py_ssize_t length may exceed INT_MAX.
   NEED_RETRY enables the code paths that split such input into
   INT_MAX-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide the flag value when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7077
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007078static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007079code_page_name(UINT code_page, PyObject **obj)
7080{
7081 *obj = NULL;
7082 if (code_page == CP_ACP)
7083 return "mbcs";
7084 if (code_page == CP_UTF7)
7085 return "CP_UTF7";
7086 if (code_page == CP_UTF8)
7087 return "CP_UTF8";
7088
7089 *obj = PyBytes_FromFormat("cp%u", code_page);
7090 if (*obj == NULL)
7091 return NULL;
7092 return PyBytes_AS_STRING(*obj);
7093}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094
Victor Stinner3a50e702011-10-18 21:21:00 +02007095static DWORD
7096decode_code_page_flags(UINT code_page)
7097{
7098 if (code_page == CP_UTF7) {
7099 /* The CP_UTF7 decoder only supports flags=0 */
7100 return 0;
7101 }
7102 else
7103 return MB_ERR_INVALID_CHARS;
7104}
7105
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 * Decode a byte string from a Windows code page into unicode object in strict
7108 * mode.
7109 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007110 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7111 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007114decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007115 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 const char *in,
7117 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118{
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007120 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122
7123 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 assert(insize > 0);
7125 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7126 if (outsize <= 0)
7127 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128
7129 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007131 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007132 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 if (*v == NULL)
7134 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136 }
7137 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007140 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 }
7144
7145 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7147 if (outsize <= 0)
7148 goto error;
7149 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007150
Victor Stinner3a50e702011-10-18 21:21:00 +02007151error:
7152 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7153 return -2;
7154 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007155 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156}
7157
Victor Stinner3a50e702011-10-18 21:21:00 +02007158/*
7159 * Decode a byte string from a code page into unicode object with an error
7160 * handler.
7161 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007162 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 * UnicodeDecodeError exception and returns -1 on error.
7164 */
7165static int
7166decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007167 PyObject **v,
7168 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007169 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007170{
7171 const char *startin = in;
7172 const char *endin = in + size;
7173 const DWORD flags = decode_code_page_flags(code_page);
7174 /* Ideally, we should get reason from FormatMessage. This is the Windows
7175 2000 English version of the message. */
7176 const char *reason = "No mapping for the Unicode character exists "
7177 "in the target code page.";
7178 /* each step cannot decode more than 1 character, but a character can be
7179 represented as a surrogate pair */
7180 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007181 int insize;
7182 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 PyObject *errorHandler = NULL;
7184 PyObject *exc = NULL;
7185 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007186 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 DWORD err;
7188 int ret = -1;
7189
7190 assert(size > 0);
7191
7192 encoding = code_page_name(code_page, &encoding_obj);
7193 if (encoding == NULL)
7194 return -1;
7195
Victor Stinner7d00cc12014-03-17 23:08:06 +01007196 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7198 UnicodeDecodeError. */
7199 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7200 if (exc != NULL) {
7201 PyCodec_StrictErrors(exc);
7202 Py_CLEAR(exc);
7203 }
7204 goto error;
7205 }
7206
7207 if (*v == NULL) {
7208 /* Create unicode object */
7209 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7210 PyErr_NoMemory();
7211 goto error;
7212 }
Victor Stinnerab595942011-12-17 04:59:06 +01007213 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 if (*v == NULL)
7216 goto error;
7217 startout = PyUnicode_AS_UNICODE(*v);
7218 }
7219 else {
7220 /* Extend unicode object */
7221 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7222 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7223 PyErr_NoMemory();
7224 goto error;
7225 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007226 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 goto error;
7228 startout = PyUnicode_AS_UNICODE(*v) + n;
7229 }
7230
7231 /* Decode the byte string character per character */
7232 out = startout;
7233 while (in < endin)
7234 {
7235 /* Decode a character */
7236 insize = 1;
7237 do
7238 {
7239 outsize = MultiByteToWideChar(code_page, flags,
7240 in, insize,
7241 buffer, Py_ARRAY_LENGTH(buffer));
7242 if (outsize > 0)
7243 break;
7244 err = GetLastError();
7245 if (err != ERROR_NO_UNICODE_TRANSLATION
7246 && err != ERROR_INSUFFICIENT_BUFFER)
7247 {
7248 PyErr_SetFromWindowsErr(0);
7249 goto error;
7250 }
7251 insize++;
7252 }
7253 /* 4=maximum length of a UTF-8 sequence */
7254 while (insize <= 4 && (in + insize) <= endin);
7255
7256 if (outsize <= 0) {
7257 Py_ssize_t startinpos, endinpos, outpos;
7258
Victor Stinner7d00cc12014-03-17 23:08:06 +01007259 /* last character in partial decode? */
7260 if (in + insize >= endin && !final)
7261 break;
7262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 startinpos = in - startin;
7264 endinpos = startinpos + 1;
7265 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007266 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 errors, &errorHandler,
7268 encoding, reason,
7269 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007270 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 {
7272 goto error;
7273 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007274 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 }
7276 else {
7277 in += insize;
7278 memcpy(out, buffer, outsize * sizeof(wchar_t));
7279 out += outsize;
7280 }
7281 }
7282
7283 /* write a NUL character at the end */
7284 *out = 0;
7285
7286 /* Extend unicode object */
7287 outsize = out - startout;
7288 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007289 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007291 /* (in - startin) <= size and size is an int */
7292 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007293
7294error:
7295 Py_XDECREF(encoding_obj);
7296 Py_XDECREF(errorHandler);
7297 Py_XDECREF(exc);
7298 return ret;
7299}
7300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301static PyObject *
7302decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 const char *s, Py_ssize_t size,
7304 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305{
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 PyObject *v = NULL;
7307 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 if (code_page < 0) {
7310 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7311 return NULL;
7312 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007313 if (size < 0) {
7314 PyErr_BadInternalCall();
7315 return NULL;
7316 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007317
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 do
7322 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 if (size > INT_MAX) {
7325 chunk_size = INT_MAX;
7326 final = 0;
7327 done = 0;
7328 }
7329 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 {
7332 chunk_size = (int)size;
7333 final = (consumed == NULL);
7334 done = 1;
7335 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 if (chunk_size == 0 && done) {
7338 if (v != NULL)
7339 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007340 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 converted = decode_code_page_strict(code_page, &v,
7344 s, chunk_size);
7345 if (converted == -2)
7346 converted = decode_code_page_errors(code_page, &v,
7347 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007348 errors, final);
7349 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007350
7351 if (converted < 0) {
7352 Py_XDECREF(v);
7353 return NULL;
7354 }
7355
7356 if (consumed)
7357 *consumed += converted;
7358
7359 s += converted;
7360 size -= converted;
7361 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007362
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007363 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364}
7365
Alexander Belopolsky40018472011-02-26 01:02:56 +00007366PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007367PyUnicode_DecodeCodePageStateful(int code_page,
7368 const char *s,
7369 Py_ssize_t size,
7370 const char *errors,
7371 Py_ssize_t *consumed)
7372{
7373 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7374}
7375
7376PyObject *
7377PyUnicode_DecodeMBCSStateful(const char *s,
7378 Py_ssize_t size,
7379 const char *errors,
7380 Py_ssize_t *consumed)
7381{
7382 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7383}
7384
7385PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007386PyUnicode_DecodeMBCS(const char *s,
7387 Py_ssize_t size,
7388 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007389{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7391}
7392
Victor Stinner3a50e702011-10-18 21:21:00 +02007393static DWORD
7394encode_code_page_flags(UINT code_page, const char *errors)
7395{
7396 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007397 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 }
7399 else if (code_page == CP_UTF7) {
7400 /* CP_UTF7 only supports flags=0 */
7401 return 0;
7402 }
7403 else {
7404 if (errors != NULL && strcmp(errors, "replace") == 0)
7405 return 0;
7406 else
7407 return WC_NO_BEST_FIT_CHARS;
7408 }
7409}
7410
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 * Encode a Unicode string to a Windows code page into a byte string in strict
7413 * mode.
7414 *
7415 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007416 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007418static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007419encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007420 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422{
Victor Stinner554f3f02010-06-16 23:33:54 +00007423 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 BOOL *pusedDefaultChar = &usedDefaultChar;
7425 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007426 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 const DWORD flags = encode_code_page_flags(code_page, NULL);
7429 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 /* Create a substring so that we can get the UTF-16 representation
7431 of just the slice under consideration. */
7432 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433
Martin v. Löwis3d325192011-11-04 18:23:06 +01007434 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007435
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007437 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007439 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007440
Victor Stinner2fc507f2011-11-04 20:06:39 +01007441 substring = PyUnicode_Substring(unicode, offset, offset+len);
7442 if (substring == NULL)
7443 return -1;
7444 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7445 if (p == NULL) {
7446 Py_DECREF(substring);
7447 return -1;
7448 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007449 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007451 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007453 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 NULL, 0,
7455 NULL, pusedDefaultChar);
7456 if (outsize <= 0)
7457 goto error;
7458 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 if (pusedDefaultChar && *pusedDefaultChar) {
7460 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007463
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 if (*outbytes == NULL) {
7468 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472 }
7473 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 const Py_ssize_t n = PyBytes_Size(*outbytes);
7476 if (outsize > PY_SSIZE_T_MAX - n) {
7477 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007478 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7482 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 }
7487
7488 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007490 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 out, outsize,
7492 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 if (outsize <= 0)
7495 goto error;
7496 if (pusedDefaultChar && *pusedDefaultChar)
7497 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007501 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7503 return -2;
7504 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007505 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007506}
7507
Victor Stinner3a50e702011-10-18 21:21:00 +02007508/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007509 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 * error handler.
7511 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007512 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 * -1 on other error.
7514 */
7515static int
7516encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007517 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007518 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007519{
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007521 Py_ssize_t pos = unicode_offset;
7522 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 /* Ideally, we should get reason from FormatMessage. This is the Windows
7524 2000 English version of the message. */
7525 const char *reason = "invalid character";
7526 /* 4=maximum length of a UTF-8 sequence */
7527 char buffer[4];
7528 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7529 Py_ssize_t outsize;
7530 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 PyObject *errorHandler = NULL;
7532 PyObject *exc = NULL;
7533 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007534 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007535 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 PyObject *rep;
7537 int ret = -1;
7538
7539 assert(insize > 0);
7540
7541 encoding = code_page_name(code_page, &encoding_obj);
7542 if (encoding == NULL)
7543 return -1;
7544
7545 if (errors == NULL || strcmp(errors, "strict") == 0) {
7546 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7547 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007548 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 if (exc != NULL) {
7550 PyCodec_StrictErrors(exc);
7551 Py_DECREF(exc);
7552 }
7553 Py_XDECREF(encoding_obj);
7554 return -1;
7555 }
7556
7557 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7558 pusedDefaultChar = &usedDefaultChar;
7559 else
7560 pusedDefaultChar = NULL;
7561
7562 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7563 PyErr_NoMemory();
7564 goto error;
7565 }
7566 outsize = insize * Py_ARRAY_LENGTH(buffer);
7567
7568 if (*outbytes == NULL) {
7569 /* Create string object */
7570 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7571 if (*outbytes == NULL)
7572 goto error;
7573 out = PyBytes_AS_STRING(*outbytes);
7574 }
7575 else {
7576 /* Extend string object */
7577 Py_ssize_t n = PyBytes_Size(*outbytes);
7578 if (n > PY_SSIZE_T_MAX - outsize) {
7579 PyErr_NoMemory();
7580 goto error;
7581 }
7582 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7583 goto error;
7584 out = PyBytes_AS_STRING(*outbytes) + n;
7585 }
7586
7587 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7591 wchar_t chars[2];
7592 int charsize;
7593 if (ch < 0x10000) {
7594 chars[0] = (wchar_t)ch;
7595 charsize = 1;
7596 }
7597 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007598 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7599 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007600 charsize = 2;
7601 }
7602
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007604 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 buffer, Py_ARRAY_LENGTH(buffer),
7606 NULL, pusedDefaultChar);
7607 if (outsize > 0) {
7608 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7609 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007610 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 memcpy(out, buffer, outsize);
7612 out += outsize;
7613 continue;
7614 }
7615 }
7616 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7617 PyErr_SetFromWindowsErr(0);
7618 goto error;
7619 }
7620
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 rep = unicode_encode_call_errorhandler(
7622 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007623 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 if (rep == NULL)
7626 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007628
7629 if (PyBytes_Check(rep)) {
7630 outsize = PyBytes_GET_SIZE(rep);
7631 if (outsize != 1) {
7632 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7633 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7634 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7635 Py_DECREF(rep);
7636 goto error;
7637 }
7638 out = PyBytes_AS_STRING(*outbytes) + offset;
7639 }
7640 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7641 out += outsize;
7642 }
7643 else {
7644 Py_ssize_t i;
7645 enum PyUnicode_Kind kind;
7646 void *data;
7647
Benjamin Petersonbac79492012-01-14 13:34:47 -05007648 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 Py_DECREF(rep);
7650 goto error;
7651 }
7652
7653 outsize = PyUnicode_GET_LENGTH(rep);
7654 if (outsize != 1) {
7655 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7656 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7657 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7658 Py_DECREF(rep);
7659 goto error;
7660 }
7661 out = PyBytes_AS_STRING(*outbytes) + offset;
7662 }
7663 kind = PyUnicode_KIND(rep);
7664 data = PyUnicode_DATA(rep);
7665 for (i=0; i < outsize; i++) {
7666 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7667 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007668 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669 encoding, unicode,
7670 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 "unable to encode error handler result to ASCII");
7672 Py_DECREF(rep);
7673 goto error;
7674 }
7675 *out = (unsigned char)ch;
7676 out++;
7677 }
7678 }
7679 Py_DECREF(rep);
7680 }
7681 /* write a NUL byte */
7682 *out = 0;
7683 outsize = out - PyBytes_AS_STRING(*outbytes);
7684 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7685 if (_PyBytes_Resize(outbytes, outsize) < 0)
7686 goto error;
7687 ret = 0;
7688
7689error:
7690 Py_XDECREF(encoding_obj);
7691 Py_XDECREF(errorHandler);
7692 Py_XDECREF(exc);
7693 return ret;
7694}
7695
Victor Stinner3a50e702011-10-18 21:21:00 +02007696static PyObject *
7697encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007698 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 const char *errors)
7700{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007701 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007703 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007704 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007705
Victor Stinner29dacf22015-01-26 16:41:32 +01007706 if (!PyUnicode_Check(unicode)) {
7707 PyErr_BadArgument();
7708 return NULL;
7709 }
7710
Benjamin Petersonbac79492012-01-14 13:34:47 -05007711 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007712 return NULL;
7713 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007714
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 if (code_page < 0) {
7716 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7717 return NULL;
7718 }
7719
Martin v. Löwis3d325192011-11-04 18:23:06 +01007720 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007721 return PyBytes_FromStringAndSize(NULL, 0);
7722
Victor Stinner7581cef2011-11-03 22:32:33 +01007723 offset = 0;
7724 do
7725 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007726#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007727 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007728 chunks. */
7729 if (len > INT_MAX/2) {
7730 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 done = 0;
7732 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007734#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 done = 1;
7738 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007739
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 errors);
7743 if (ret == -2)
7744 ret = encode_code_page_errors(code_page, &outbytes,
7745 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 if (ret < 0) {
7748 Py_XDECREF(outbytes);
7749 return NULL;
7750 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Victor Stinner7581cef2011-11-03 22:32:33 +01007752 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007753 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007754 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007755
Victor Stinner3a50e702011-10-18 21:21:00 +02007756 return outbytes;
7757}
7758
7759PyObject *
7760PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7761 Py_ssize_t size,
7762 const char *errors)
7763{
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007765 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007766 if (unicode == NULL)
7767 return NULL;
7768 res = encode_code_page(CP_ACP, unicode, errors);
7769 Py_DECREF(unicode);
7770 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007771}
7772
7773PyObject *
7774PyUnicode_EncodeCodePage(int code_page,
7775 PyObject *unicode,
7776 const char *errors)
7777{
Victor Stinner7581cef2011-11-03 22:32:33 +01007778 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007779}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007780
Alexander Belopolsky40018472011-02-26 01:02:56 +00007781PyObject *
7782PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007783{
Victor Stinner7581cef2011-11-03 22:32:33 +01007784 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007785}
7786
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007787#undef NEED_RETRY
7788
Steve Dowercc16be82016-09-08 10:35:16 -07007789#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007790
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791/* --- Character Mapping Codec -------------------------------------------- */
7792
Victor Stinnerfb161b12013-04-18 01:44:27 +02007793static int
7794charmap_decode_string(const char *s,
7795 Py_ssize_t size,
7796 PyObject *mapping,
7797 const char *errors,
7798 _PyUnicodeWriter *writer)
7799{
7800 const char *starts = s;
7801 const char *e;
7802 Py_ssize_t startinpos, endinpos;
7803 PyObject *errorHandler = NULL, *exc = NULL;
7804 Py_ssize_t maplen;
7805 enum PyUnicode_Kind mapkind;
7806 void *mapdata;
7807 Py_UCS4 x;
7808 unsigned char ch;
7809
7810 if (PyUnicode_READY(mapping) == -1)
7811 return -1;
7812
7813 maplen = PyUnicode_GET_LENGTH(mapping);
7814 mapdata = PyUnicode_DATA(mapping);
7815 mapkind = PyUnicode_KIND(mapping);
7816
7817 e = s + size;
7818
7819 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7820 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7821 * is disabled in encoding aliases, latin1 is preferred because
7822 * its implementation is faster. */
7823 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7824 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7825 Py_UCS4 maxchar = writer->maxchar;
7826
7827 assert (writer->kind == PyUnicode_1BYTE_KIND);
7828 while (s < e) {
7829 ch = *s;
7830 x = mapdata_ucs1[ch];
7831 if (x > maxchar) {
7832 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7833 goto onError;
7834 maxchar = writer->maxchar;
7835 outdata = (Py_UCS1 *)writer->data;
7836 }
7837 outdata[writer->pos] = x;
7838 writer->pos++;
7839 ++s;
7840 }
7841 return 0;
7842 }
7843
7844 while (s < e) {
7845 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7846 enum PyUnicode_Kind outkind = writer->kind;
7847 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7848 if (outkind == PyUnicode_1BYTE_KIND) {
7849 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7850 Py_UCS4 maxchar = writer->maxchar;
7851 while (s < e) {
7852 ch = *s;
7853 x = mapdata_ucs2[ch];
7854 if (x > maxchar)
7855 goto Error;
7856 outdata[writer->pos] = x;
7857 writer->pos++;
7858 ++s;
7859 }
7860 break;
7861 }
7862 else if (outkind == PyUnicode_2BYTE_KIND) {
7863 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7864 while (s < e) {
7865 ch = *s;
7866 x = mapdata_ucs2[ch];
7867 if (x == 0xFFFE)
7868 goto Error;
7869 outdata[writer->pos] = x;
7870 writer->pos++;
7871 ++s;
7872 }
7873 break;
7874 }
7875 }
7876 ch = *s;
7877
7878 if (ch < maplen)
7879 x = PyUnicode_READ(mapkind, mapdata, ch);
7880 else
7881 x = 0xfffe; /* invalid value */
7882Error:
7883 if (x == 0xfffe)
7884 {
7885 /* undefined mapping */
7886 startinpos = s-starts;
7887 endinpos = startinpos+1;
7888 if (unicode_decode_call_errorhandler_writer(
7889 errors, &errorHandler,
7890 "charmap", "character maps to <undefined>",
7891 &starts, &e, &startinpos, &endinpos, &exc, &s,
7892 writer)) {
7893 goto onError;
7894 }
7895 continue;
7896 }
7897
7898 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7899 goto onError;
7900 ++s;
7901 }
7902 Py_XDECREF(errorHandler);
7903 Py_XDECREF(exc);
7904 return 0;
7905
7906onError:
7907 Py_XDECREF(errorHandler);
7908 Py_XDECREF(exc);
7909 return -1;
7910}
7911
7912static int
7913charmap_decode_mapping(const char *s,
7914 Py_ssize_t size,
7915 PyObject *mapping,
7916 const char *errors,
7917 _PyUnicodeWriter *writer)
7918{
7919 const char *starts = s;
7920 const char *e;
7921 Py_ssize_t startinpos, endinpos;
7922 PyObject *errorHandler = NULL, *exc = NULL;
7923 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007924 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007925
7926 e = s + size;
7927
7928 while (s < e) {
7929 ch = *s;
7930
7931 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7932 key = PyLong_FromLong((long)ch);
7933 if (key == NULL)
7934 goto onError;
7935
7936 item = PyObject_GetItem(mapping, key);
7937 Py_DECREF(key);
7938 if (item == NULL) {
7939 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7940 /* No mapping found means: mapping is undefined. */
7941 PyErr_Clear();
7942 goto Undefined;
7943 } else
7944 goto onError;
7945 }
7946
7947 /* Apply mapping */
7948 if (item == Py_None)
7949 goto Undefined;
7950 if (PyLong_Check(item)) {
7951 long value = PyLong_AS_LONG(item);
7952 if (value == 0xFFFE)
7953 goto Undefined;
7954 if (value < 0 || value > MAX_UNICODE) {
7955 PyErr_Format(PyExc_TypeError,
7956 "character mapping must be in range(0x%lx)",
7957 (unsigned long)MAX_UNICODE + 1);
7958 goto onError;
7959 }
7960
7961 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7962 goto onError;
7963 }
7964 else if (PyUnicode_Check(item)) {
7965 if (PyUnicode_READY(item) == -1)
7966 goto onError;
7967 if (PyUnicode_GET_LENGTH(item) == 1) {
7968 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7969 if (value == 0xFFFE)
7970 goto Undefined;
7971 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7972 goto onError;
7973 }
7974 else {
7975 writer->overallocate = 1;
7976 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7977 goto onError;
7978 }
7979 }
7980 else {
7981 /* wrong return value */
7982 PyErr_SetString(PyExc_TypeError,
7983 "character mapping must return integer, None or str");
7984 goto onError;
7985 }
7986 Py_CLEAR(item);
7987 ++s;
7988 continue;
7989
7990Undefined:
7991 /* undefined mapping */
7992 Py_CLEAR(item);
7993 startinpos = s-starts;
7994 endinpos = startinpos+1;
7995 if (unicode_decode_call_errorhandler_writer(
7996 errors, &errorHandler,
7997 "charmap", "character maps to <undefined>",
7998 &starts, &e, &startinpos, &endinpos, &exc, &s,
7999 writer)) {
8000 goto onError;
8001 }
8002 }
8003 Py_XDECREF(errorHandler);
8004 Py_XDECREF(exc);
8005 return 0;
8006
8007onError:
8008 Py_XDECREF(item);
8009 Py_XDECREF(errorHandler);
8010 Py_XDECREF(exc);
8011 return -1;
8012}
8013
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014PyObject *
8015PyUnicode_DecodeCharmap(const char *s,
8016 Py_ssize_t size,
8017 PyObject *mapping,
8018 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008020 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 /* Default to Latin-1 */
8023 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008027 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008028 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008029 writer.min_length = size;
8030 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008032
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008033 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008036 }
8037 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008038 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8039 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008041 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008042
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008044 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 return NULL;
8046}
8047
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048/* Charmap encoding: the lookup table */
8049
Alexander Belopolsky40018472011-02-26 01:02:56 +00008050struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 PyObject_HEAD
8052 unsigned char level1[32];
8053 int count2, count3;
8054 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055};
8056
8057static PyObject*
8058encoding_map_size(PyObject *obj, PyObject* args)
8059{
8060 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008063}
8064
8065static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008066 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 PyDoc_STR("Return the size (in bytes) of this object") },
8068 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069};
8070
8071static void
8072encoding_map_dealloc(PyObject* o)
8073{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075}
8076
8077static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008078 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 "EncodingMap", /*tp_name*/
8080 sizeof(struct encoding_map), /*tp_basicsize*/
8081 0, /*tp_itemsize*/
8082 /* methods */
8083 encoding_map_dealloc, /*tp_dealloc*/
8084 0, /*tp_print*/
8085 0, /*tp_getattr*/
8086 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008087 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 0, /*tp_repr*/
8089 0, /*tp_as_number*/
8090 0, /*tp_as_sequence*/
8091 0, /*tp_as_mapping*/
8092 0, /*tp_hash*/
8093 0, /*tp_call*/
8094 0, /*tp_str*/
8095 0, /*tp_getattro*/
8096 0, /*tp_setattro*/
8097 0, /*tp_as_buffer*/
8098 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8099 0, /*tp_doc*/
8100 0, /*tp_traverse*/
8101 0, /*tp_clear*/
8102 0, /*tp_richcompare*/
8103 0, /*tp_weaklistoffset*/
8104 0, /*tp_iter*/
8105 0, /*tp_iternext*/
8106 encoding_map_methods, /*tp_methods*/
8107 0, /*tp_members*/
8108 0, /*tp_getset*/
8109 0, /*tp_base*/
8110 0, /*tp_dict*/
8111 0, /*tp_descr_get*/
8112 0, /*tp_descr_set*/
8113 0, /*tp_dictoffset*/
8114 0, /*tp_init*/
8115 0, /*tp_alloc*/
8116 0, /*tp_new*/
8117 0, /*tp_free*/
8118 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119};
8120
8121PyObject*
8122PyUnicode_BuildEncodingMap(PyObject* string)
8123{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 PyObject *result;
8125 struct encoding_map *mresult;
8126 int i;
8127 int need_dict = 0;
8128 unsigned char level1[32];
8129 unsigned char level2[512];
8130 unsigned char *mlevel1, *mlevel2, *mlevel3;
8131 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 int kind;
8133 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008137 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 PyErr_BadArgument();
8139 return NULL;
8140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 kind = PyUnicode_KIND(string);
8142 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008143 length = PyUnicode_GET_LENGTH(string);
8144 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 memset(level1, 0xFF, sizeof level1);
8146 memset(level2, 0xFF, sizeof level2);
8147
8148 /* If there isn't a one-to-one mapping of NULL to \0,
8149 or if there are non-BMP characters, we need to use
8150 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008153 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 ch = PyUnicode_READ(kind, data, i);
8156 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 need_dict = 1;
8158 break;
8159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 /* unmapped character */
8162 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008163 l1 = ch >> 11;
8164 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 if (level1[l1] == 0xFF)
8166 level1[l1] = count2++;
8167 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 }
8170
8171 if (count2 >= 0xFF || count3 >= 0xFF)
8172 need_dict = 1;
8173
8174 if (need_dict) {
8175 PyObject *result = PyDict_New();
8176 PyObject *key, *value;
8177 if (!result)
8178 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008179 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008181 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182 if (!key || !value)
8183 goto failed1;
8184 if (PyDict_SetItem(result, key, value) == -1)
8185 goto failed1;
8186 Py_DECREF(key);
8187 Py_DECREF(value);
8188 }
8189 return result;
8190 failed1:
8191 Py_XDECREF(key);
8192 Py_XDECREF(value);
8193 Py_DECREF(result);
8194 return NULL;
8195 }
8196
8197 /* Create a three-level trie */
8198 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8199 16*count2 + 128*count3 - 1);
8200 if (!result)
8201 return PyErr_NoMemory();
8202 PyObject_Init(result, &EncodingMapType);
8203 mresult = (struct encoding_map*)result;
8204 mresult->count2 = count2;
8205 mresult->count3 = count3;
8206 mlevel1 = mresult->level1;
8207 mlevel2 = mresult->level23;
8208 mlevel3 = mresult->level23 + 16*count2;
8209 memcpy(mlevel1, level1, 32);
8210 memset(mlevel2, 0xFF, 16*count2);
8211 memset(mlevel3, 0, 128*count3);
8212 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008213 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8216 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 /* unmapped character */
8218 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008219 o1 = ch>>11;
8220 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 i2 = 16*mlevel1[o1] + o2;
8222 if (mlevel2[i2] == 0xFF)
8223 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008224 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008225 i3 = 128*mlevel2[i2] + o3;
8226 mlevel3[i3] = i;
8227 }
8228 return result;
8229}
8230
8231static int
Victor Stinner22168992011-11-20 17:09:18 +01008232encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008233{
8234 struct encoding_map *map = (struct encoding_map*)mapping;
8235 int l1 = c>>11;
8236 int l2 = (c>>7) & 0xF;
8237 int l3 = c & 0x7F;
8238 int i;
8239
Victor Stinner22168992011-11-20 17:09:18 +01008240 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008242 if (c == 0)
8243 return 0;
8244 /* level 1*/
8245 i = map->level1[l1];
8246 if (i == 0xFF) {
8247 return -1;
8248 }
8249 /* level 2*/
8250 i = map->level23[16*i+l2];
8251 if (i == 0xFF) {
8252 return -1;
8253 }
8254 /* level 3 */
8255 i = map->level23[16*map->count2 + 128*i + l3];
8256 if (i == 0) {
8257 return -1;
8258 }
8259 return i;
8260}
8261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262/* Lookup the character ch in the mapping. If the character
8263 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008264 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008265static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008266charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267{
Christian Heimes217cfd12007-12-02 14:31:20 +00008268 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 PyObject *x;
8270
8271 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 x = PyObject_GetItem(mapping, w);
8274 Py_DECREF(w);
8275 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8277 /* No mapping found means: mapping is undefined. */
8278 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008279 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 } else
8281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008283 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008285 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 long value = PyLong_AS_LONG(x);
8287 if (value < 0 || value > 255) {
8288 PyErr_SetString(PyExc_TypeError,
8289 "character mapping must be in range(256)");
8290 Py_DECREF(x);
8291 return NULL;
8292 }
8293 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008295 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 /* wrong return value */
8299 PyErr_Format(PyExc_TypeError,
8300 "character mapping must return integer, bytes or None, not %.400s",
8301 x->ob_type->tp_name);
8302 Py_DECREF(x);
8303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
8305}
8306
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008308charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8311 /* exponentially overallocate to minimize reallocations */
8312 if (requiredsize < 2*outsize)
8313 requiredsize = 2*outsize;
8314 if (_PyBytes_Resize(outobj, requiredsize))
8315 return -1;
8316 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317}
8318
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred during lookup or resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008323 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 space is available. Return a new reference to the object that
8325 was put in the output buffer, or Py_None, if the mapping was undefined
8326 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008327 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008329charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 PyObject *rep;
8333 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008334 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335
Christian Heimes90aa7642007-12-19 02:45:37 +00008336 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 if (res == -1)
8340 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 if (outsize<requiredsize)
8342 if (charmapencode_resize(outobj, outpos, requiredsize))
8343 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 outstart[(*outpos)++] = (char)res;
8346 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 }
8348
8349 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 Py_DECREF(rep);
8354 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 if (PyLong_Check(rep)) {
8357 Py_ssize_t requiredsize = *outpos+1;
8358 if (outsize<requiredsize)
8359 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8360 Py_DECREF(rep);
8361 return enc_EXCEPTION;
8362 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008363 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 else {
8367 const char *repchars = PyBytes_AS_STRING(rep);
8368 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8369 Py_ssize_t requiredsize = *outpos+repsize;
8370 if (outsize<requiredsize)
8371 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8372 Py_DECREF(rep);
8373 return enc_EXCEPTION;
8374 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008375 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 memcpy(outstart + *outpos, repchars, repsize);
8377 *outpos += repsize;
8378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 Py_DECREF(rep);
8381 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382}
8383
8384/* handle an error in PyUnicode_EncodeCharmap
8385 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386static int
8387charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008388 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008390 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008391 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392{
8393 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008396 enum PyUnicode_Kind kind;
8397 void *data;
8398 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008400 Py_ssize_t collstartpos = *inpos;
8401 Py_ssize_t collendpos = *inpos+1;
8402 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008403 const char *encoding = "charmap";
8404 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008406 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008407 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408
Benjamin Petersonbac79492012-01-14 13:34:47 -05008409 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008410 return -1;
8411 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 /* find all unencodable characters */
8413 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008415 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008416 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008417 val = encoding_map_lookup(ch, mapping);
8418 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 break;
8420 ++collendpos;
8421 continue;
8422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008424 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8425 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 if (rep==NULL)
8427 return -1;
8428 else if (rep!=Py_None) {
8429 Py_DECREF(rep);
8430 break;
8431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 }
8435 /* cache callback name lookup
8436 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008437 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008438 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008439
8440 switch (*error_handler) {
8441 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008442 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008444
8445 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 x = charmapencode_output('?', mapping, res, respos);
8448 if (x==enc_EXCEPTION) {
8449 return -1;
8450 }
8451 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008452 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 return -1;
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 }
8456 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008457 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 *inpos = collendpos;
8459 break;
Victor Stinner50149202015-09-22 00:26:54 +02008460
8461 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 /* generate replacement (temporarily (mis)uses p) */
8463 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 char buffer[2+29+1+1];
8465 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008466 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 for (cp = buffer; *cp; ++cp) {
8468 x = charmapencode_output(*cp, mapping, res, respos);
8469 if (x==enc_EXCEPTION)
8470 return -1;
8471 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008472 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 return -1;
8474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 }
8476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 *inpos = collendpos;
8478 break;
Victor Stinner50149202015-09-22 00:26:54 +02008479
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 default:
Victor Stinner50149202015-09-22 00:26:54 +02008481 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008482 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008486 if (PyBytes_Check(repunicode)) {
8487 /* Directly copy bytes result to output. */
8488 Py_ssize_t outsize = PyBytes_Size(*res);
8489 Py_ssize_t requiredsize;
8490 repsize = PyBytes_Size(repunicode);
8491 requiredsize = *respos + repsize;
8492 if (requiredsize > outsize)
8493 /* Make room for all additional bytes. */
8494 if (charmapencode_resize(res, respos, requiredsize)) {
8495 Py_DECREF(repunicode);
8496 return -1;
8497 }
8498 memcpy(PyBytes_AsString(*res) + *respos,
8499 PyBytes_AsString(repunicode), repsize);
8500 *respos += repsize;
8501 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008502 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008503 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008506 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008507 Py_DECREF(repunicode);
8508 return -1;
8509 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008510 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008511 data = PyUnicode_DATA(repunicode);
8512 kind = PyUnicode_KIND(repunicode);
8513 for (index = 0; index < repsize; index++) {
8514 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8515 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008517 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return -1;
8519 }
8520 else if (x==enc_FAILED) {
8521 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008522 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return -1;
8524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008525 }
8526 *inpos = newpos;
8527 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 }
8529 return 0;
8530}
8531
Alexander Belopolsky40018472011-02-26 01:02:56 +00008532PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533_PyUnicode_EncodeCharmap(PyObject *unicode,
8534 PyObject *mapping,
8535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 /* output object */
8538 PyObject *res = NULL;
8539 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008543 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008544 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008546 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008547 void *data;
8548 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
Benjamin Petersonbac79492012-01-14 13:34:47 -05008550 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551 return NULL;
8552 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008553 data = PyUnicode_DATA(unicode);
8554 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 /* Default to Latin-1 */
8557 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 /* allocate enough for a simple encoding without
8561 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008562 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (res == NULL)
8564 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008565 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008569 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 if (x==enc_EXCEPTION) /* error */
8573 goto onError;
8574 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008575 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008577 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 &res, &respos)) {
8579 goto onError;
8580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 else
8583 /* done with this character => adjust input position */
8584 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008588 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008589 if (_PyBytes_Resize(&res, respos) < 0)
8590 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008593 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 return res;
8595
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 Py_XDECREF(res);
8598 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008599 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 return NULL;
8601}
8602
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008603/* Deprecated */
8604PyObject *
8605PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8606 Py_ssize_t size,
8607 PyObject *mapping,
8608 const char *errors)
8609{
8610 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008611 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612 if (unicode == NULL)
8613 return NULL;
8614 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8615 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008616 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008617}
8618
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619PyObject *
8620PyUnicode_AsCharmapString(PyObject *unicode,
8621 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622{
8623 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 PyErr_BadArgument();
8625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008627 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628}
8629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631static void
8632make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008634 Py_ssize_t startpos, Py_ssize_t endpos,
8635 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 *exceptionObject = _PyUnicodeTranslateError_Create(
8639 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 }
8641 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8643 goto onError;
8644 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8645 goto onError;
8646 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8647 goto onError;
8648 return;
8649 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008650 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
8652}
8653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654/* error handling callback helper:
8655 build arguments, call the callback and check the arguments,
8656 put the result into newpos and return the replacement string, which
8657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static PyObject *
8659unicode_translate_call_errorhandler(const char *errors,
8660 PyObject **errorHandler,
8661 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663 Py_ssize_t startpos, Py_ssize_t endpos,
8664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008666 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008668 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 PyObject *restuple;
8670 PyObject *resunicode;
8671
8672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
8677
8678 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008683 restuple = PyObject_CallFunctionObjArgs(
8684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008688 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008692 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 &resunicode, &i_newpos)) {
8694 Py_DECREF(restuple);
8695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 else
8700 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008702 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 Py_DECREF(restuple);
8704 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 Py_INCREF(resunicode);
8707 Py_DECREF(restuple);
8708 return resunicode;
8709}
8710
8711/* Lookup the character ch in the mapping and put the result in result,
8712 which must be decrefed by the caller.
8713 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008714static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716{
Christian Heimes217cfd12007-12-02 14:31:20 +00008717 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 PyObject *x;
8719
8720 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 x = PyObject_GetItem(mapping, w);
8723 Py_DECREF(w);
8724 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8726 /* No mapping found means: use 1:1 mapping. */
8727 PyErr_Clear();
8728 *result = NULL;
8729 return 0;
8730 } else
8731 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
8733 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 *result = x;
8735 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008737 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008739 if (value < 0 || value > MAX_UNICODE) {
8740 PyErr_Format(PyExc_ValueError,
8741 "character mapping must be in range(0x%x)",
8742 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 Py_DECREF(x);
8744 return -1;
8745 }
8746 *result = x;
8747 return 0;
8748 }
8749 else if (PyUnicode_Check(x)) {
8750 *result = x;
8751 return 0;
8752 }
8753 else {
8754 /* wrong return value */
8755 PyErr_SetString(PyExc_TypeError,
8756 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 Py_DECREF(x);
8758 return -1;
8759 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760}
Victor Stinner1194ea02014-04-04 19:37:40 +02008761
8762/* lookup the character, write the result into the writer.
8763 Return 1 if the result was written into the writer, return 0 if the mapping
8764 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008765static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008766charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8767 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768{
Victor Stinner1194ea02014-04-04 19:37:40 +02008769 PyObject *item;
8770
8771 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008773
8774 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008776 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008779 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008780 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008781
8782 if (item == Py_None) {
8783 Py_DECREF(item);
8784 return 0;
8785 }
8786
8787 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008788 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8789 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8790 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008791 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8792 Py_DECREF(item);
8793 return -1;
8794 }
8795 Py_DECREF(item);
8796 return 1;
8797 }
8798
8799 if (!PyUnicode_Check(item)) {
8800 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008802 }
8803
8804 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8805 Py_DECREF(item);
8806 return -1;
8807 }
8808
8809 Py_DECREF(item);
8810 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811}
8812
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813static int
8814unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8815 Py_UCS1 *translate)
8816{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 int ret = 0;
8819
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820 if (charmaptranslate_lookup(ch, mapping, &item)) {
8821 return -1;
8822 }
8823
8824 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008825 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008826 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008828 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008829 /* not found => default to 1:1 mapping */
8830 translate[ch] = ch;
8831 return 1;
8832 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008833 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008834 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008835 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8836 used it */
8837 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838 /* invalid character or character outside ASCII:
8839 skip the fast translate */
8840 goto exit;
8841 }
8842 translate[ch] = (Py_UCS1)replace;
8843 }
8844 else if (PyUnicode_Check(item)) {
8845 Py_UCS4 replace;
8846
8847 if (PyUnicode_READY(item) == -1) {
8848 Py_DECREF(item);
8849 return -1;
8850 }
8851 if (PyUnicode_GET_LENGTH(item) != 1)
8852 goto exit;
8853
8854 replace = PyUnicode_READ_CHAR(item, 0);
8855 if (replace > 127)
8856 goto exit;
8857 translate[ch] = (Py_UCS1)replace;
8858 }
8859 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008860 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 goto exit;
8862 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 ret = 1;
8864
Benjamin Peterson1365de72014-04-07 20:15:41 -04008865 exit:
8866 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008867 return ret;
8868}
8869
8870/* Fast path for ascii => ascii translation. Return 1 if the whole string
8871 was translated into writer, return 0 if the input string was partially
8872 translated into writer, raise an exception and return -1 on error. */
8873static int
8874unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008875 _PyUnicodeWriter *writer, int ignore,
8876 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877{
Victor Stinner872b2912014-04-05 14:27:07 +02008878 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 Py_ssize_t len;
8880 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008881 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883 len = PyUnicode_GET_LENGTH(input);
8884
Victor Stinner872b2912014-04-05 14:27:07 +02008885 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886
8887 in = PyUnicode_1BYTE_DATA(input);
8888 end = in + len;
8889
8890 assert(PyUnicode_IS_ASCII(writer->buffer));
8891 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8892 out = PyUnicode_1BYTE_DATA(writer->buffer);
8893
Victor Stinner872b2912014-04-05 14:27:07 +02008894 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008896 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008898 int translate = unicode_fast_translate_lookup(mapping, ch,
8899 ascii_table);
8900 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008902 if (translate == 0)
8903 goto exit;
8904 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008905 }
Victor Stinner872b2912014-04-05 14:27:07 +02008906 if (ch2 == 0xfe) {
8907 if (ignore)
8908 continue;
8909 goto exit;
8910 }
8911 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008913 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 }
Victor Stinner872b2912014-04-05 14:27:07 +02008915 res = 1;
8916
8917exit:
8918 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008919 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008920 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921}
8922
Victor Stinner3222da22015-10-01 22:07:32 +02008923static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924_PyUnicode_TranslateCharmap(PyObject *input,
8925 PyObject *mapping,
8926 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 Py_ssize_t size, i;
8931 int kind;
8932 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 _PyUnicodeWriter writer;
8934 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008935 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936 PyObject *errorHandler = NULL;
8937 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008939 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 PyErr_BadArgument();
8943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 if (PyUnicode_READY(input) == -1)
8947 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008948 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 kind = PyUnicode_KIND(input);
8950 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008952 if (size == 0)
8953 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955 /* allocate enough for a simple 1:1 translation without
8956 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008957 _PyUnicodeWriter_Init(&writer);
8958 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960
Victor Stinner872b2912014-04-05 14:27:07 +02008961 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8962
Victor Stinner33798672016-03-01 21:59:58 +01008963 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008964 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008965 if (PyUnicode_IS_ASCII(input)) {
8966 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8967 if (res < 0) {
8968 _PyUnicodeWriter_Dealloc(&writer);
8969 return NULL;
8970 }
8971 if (res == 1)
8972 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973 }
Victor Stinner33798672016-03-01 21:59:58 +01008974 else {
8975 i = 0;
8976 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 int translate;
8981 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8982 Py_ssize_t newpos;
8983 /* startpos for collecting untranslatable chars */
8984 Py_ssize_t collstart;
8985 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 ch = PyUnicode_READ(kind, data, i);
8989 translate = charmaptranslate_output(ch, mapping, &writer);
8990 if (translate < 0)
8991 goto onError;
8992
8993 if (translate != 0) {
8994 /* it worked => adjust input pointer */
8995 ++i;
8996 continue;
8997 }
8998
8999 /* untranslatable character */
9000 collstart = i;
9001 collend = i+1;
9002
9003 /* find all untranslatable characters */
9004 while (collend < size) {
9005 PyObject *x;
9006 ch = PyUnicode_READ(kind, data, collend);
9007 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 Py_XDECREF(x);
9010 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 ++collend;
9013 }
9014
9015 if (ignore) {
9016 i = collend;
9017 }
9018 else {
9019 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9020 reason, input, &exc,
9021 collstart, collend, &newpos);
9022 if (repunicode == NULL)
9023 goto onError;
9024 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009027 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009028 Py_DECREF(repunicode);
9029 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009030 }
9031 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032 Py_XDECREF(exc);
9033 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009034 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009037 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038 Py_XDECREF(exc);
9039 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 return NULL;
9041}
9042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043/* Deprecated. Use PyUnicode_Translate instead. */
9044PyObject *
9045PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9046 Py_ssize_t size,
9047 PyObject *mapping,
9048 const char *errors)
9049{
Christian Heimes5f520f42012-09-11 14:03:25 +02009050 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009051 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (!unicode)
9053 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009054 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9055 Py_DECREF(unicode);
9056 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057}
9058
Alexander Belopolsky40018472011-02-26 01:02:56 +00009059PyObject *
9060PyUnicode_Translate(PyObject *str,
9061 PyObject *mapping,
9062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009064 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009065 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009066 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067}
Tim Petersced69f82003-09-16 20:30:58 +00009068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069PyObject *
9070_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9071{
9072 if (!PyUnicode_Check(unicode)) {
9073 PyErr_BadInternalCall();
9074 return NULL;
9075 }
9076 if (PyUnicode_READY(unicode) == -1)
9077 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009078 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 /* If the string is already ASCII, just return the same string */
9080 Py_INCREF(unicode);
9081 return unicode;
9082 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009083
9084 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9085 PyObject *result = PyUnicode_New(len, 127);
9086 if (result == NULL) {
9087 return NULL;
9088 }
9089
9090 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9091 int kind = PyUnicode_KIND(unicode);
9092 const void *data = PyUnicode_DATA(unicode);
9093 Py_ssize_t i;
9094 for (i = 0; i < len; ++i) {
9095 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9096 if (ch < 127) {
9097 out[i] = ch;
9098 }
9099 else if (Py_UNICODE_ISSPACE(ch)) {
9100 out[i] = ' ';
9101 }
9102 else {
9103 int decimal = Py_UNICODE_TODECIMAL(ch);
9104 if (decimal < 0) {
9105 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009106 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009107 _PyUnicode_LENGTH(result) = i + 1;
9108 break;
9109 }
9110 out[i] = '0' + decimal;
9111 }
9112 }
9113
INADA Naoki16dfca42018-07-14 12:06:43 +09009114 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009115 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116}
9117
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118PyObject *
9119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9120 Py_ssize_t length)
9121{
Victor Stinnerf0124502011-11-21 23:12:56 +01009122 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009124 Py_UCS4 maxchar;
9125 enum PyUnicode_Kind kind;
9126 void *data;
9127
Victor Stinner99d7ad02012-02-22 13:37:39 +01009128 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009129 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009130 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 if (ch > 127) {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009134 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009135 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 }
9137 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009138
9139 /* Copy to a new string */
9140 decimal = PyUnicode_New(length, maxchar);
9141 if (decimal == NULL)
9142 return decimal;
9143 kind = PyUnicode_KIND(decimal);
9144 data = PyUnicode_DATA(decimal);
9145 /* Iterate over code points */
9146 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009147 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009148 if (ch > 127) {
9149 int decimal = Py_UNICODE_TODECIMAL(ch);
9150 if (decimal >= 0)
9151 ch = '0' + decimal;
9152 }
9153 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009155 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009157/* --- Decimal Encoder ---------------------------------------------------- */
9158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159int
9160PyUnicode_EncodeDecimal(Py_UNICODE *s,
9161 Py_ssize_t length,
9162 char *output,
9163 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009164{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009165 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009166 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009167 enum PyUnicode_Kind kind;
9168 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169
9170 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 PyErr_BadArgument();
9172 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009173 }
9174
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009175 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009176 if (unicode == NULL)
9177 return -1;
9178
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009183 PyObject *exc;
9184 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009192 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
Victor Stinner6345be92011-11-25 20:09:01 +01009206
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220}
9221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222/* --- Helpers ------------------------------------------------------------ */
9223
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped to 0.  `start` and `end` must be modifiable
   lvalues and may be evaluated several times, as may `len`.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe inside an unbraced if/else); it must be followed
   by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242 Py_ssize_t end,
9243 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 if (kind1 < kind2)
9252 return -1;
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277
Victor Stinner794d5672011-10-10 03:21:36 +02009278 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009293 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009294 }
9295 }
9296 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009311 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
9321Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009322_PyUnicode_InsertThousandsGrouping(
9323 PyObject *unicode, Py_ssize_t index,
9324 Py_ssize_t n_buffer,
9325 void *digits, Py_ssize_t n_digits,
9326 Py_ssize_t min_width,
9327 const char *grouping, PyObject *thousands_sep,
9328 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329{
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009331 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 Py_ssize_t thousands_sep_len;
9333 Py_ssize_t len;
9334
9335 if (unicode != NULL) {
9336 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009337 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 }
9339 else {
9340 kind = PyUnicode_1BYTE_KIND;
9341 data = NULL;
9342 }
9343 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9344 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9345 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9346 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009347 if (thousands_sep_kind < kind) {
9348 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9349 if (!thousands_sep_data)
9350 return -1;
9351 }
9352 else {
9353 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9354 if (!data)
9355 return -1;
9356 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 }
9358
Benjamin Petersonead6b532011-12-20 17:23:42 -06009359 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 break;
9384 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009385 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009387 if (unicode != NULL && thousands_sep_kind != kind) {
9388 if (thousands_sep_kind < kind)
9389 PyMem_Free(thousands_sep_data);
9390 else
9391 PyMem_Free(data);
9392 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009393 if (unicode == NULL) {
9394 *maxchar = 127;
9395 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009396 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009397 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 }
9399 }
9400 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401}
9402
9403
Alexander Belopolsky40018472011-02-26 01:02:56 +00009404Py_ssize_t
9405PyUnicode_Count(PyObject *str,
9406 PyObject *substr,
9407 Py_ssize_t start,
9408 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009410 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 void *buf1 = NULL, *buf2 = NULL;
9413 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009417
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 kind1 = PyUnicode_KIND(str);
9419 kind2 = PyUnicode_KIND(substr);
9420 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009421 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 len1 = PyUnicode_GET_LENGTH(str);
9424 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009426 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009429 buf1 = PyUnicode_DATA(str);
9430 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009432 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (!buf2)
9434 goto onError;
9435 }
9436
9437 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009439 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009440 result = asciilib_count(
9441 ((Py_UCS1*)buf1) + start, end - start,
9442 buf2, len2, PY_SSIZE_T_MAX
9443 );
9444 else
9445 result = ucs1lib_count(
9446 ((Py_UCS1*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 break;
9450 case PyUnicode_2BYTE_KIND:
9451 result = ucs2lib_count(
9452 ((Py_UCS2*)buf1) + start, end - start,
9453 buf2, len2, PY_SSIZE_T_MAX
9454 );
9455 break;
9456 case PyUnicode_4BYTE_KIND:
9457 result = ucs4lib_count(
9458 ((Py_UCS4*)buf1) + start, end - start,
9459 buf2, len2, PY_SSIZE_T_MAX
9460 );
9461 break;
9462 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009463 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009466 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 PyMem_Free(buf2);
9468
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 PyMem_Free(buf2);
9473 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474}
9475
Alexander Belopolsky40018472011-02-26 01:02:56 +00009476Py_ssize_t
9477PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009478 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009479 Py_ssize_t start,
9480 Py_ssize_t end,
9481 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009483 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009485
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009486 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487}
9488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489Py_ssize_t
9490PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9491 Py_ssize_t start, Py_ssize_t end,
9492 int direction)
9493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009495 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 if (PyUnicode_READY(str) == -1)
9497 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009498 len = PyUnicode_GET_LENGTH(str);
9499 ADJUST_INDICES(start, end, len);
9500 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009501 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009503 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9504 kind, end-start, ch, direction);
9505 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009507 else
9508 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509}
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009512tailmatch(PyObject *self,
9513 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009514 Py_ssize_t start,
9515 Py_ssize_t end,
9516 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 int kind_self;
9519 int kind_sub;
9520 void *data_self;
9521 void *data_sub;
9522 Py_ssize_t offset;
9523 Py_ssize_t i;
9524 Py_ssize_t end_sub;
9525
9526 if (PyUnicode_READY(self) == -1 ||
9527 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009528 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9531 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009535 if (PyUnicode_GET_LENGTH(substring) == 0)
9536 return 1;
9537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 kind_self = PyUnicode_KIND(self);
9539 data_self = PyUnicode_DATA(self);
9540 kind_sub = PyUnicode_KIND(substring);
9541 data_sub = PyUnicode_DATA(substring);
9542 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9543
9544 if (direction > 0)
9545 offset = end;
9546 else
9547 offset = start;
9548
9549 if (PyUnicode_READ(kind_self, data_self, offset) ==
9550 PyUnicode_READ(kind_sub, data_sub, 0) &&
9551 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9552 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9553 /* If both are of the same kind, memcmp is sufficient */
9554 if (kind_self == kind_sub) {
9555 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009556 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 data_sub,
9558 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009559 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009561 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 else {
9563 /* We do not need to compare 0 and len(substring)-1 because
9564 the if statement above ensured already that they are equal
9565 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 for (i = 1; i < end_sub; ++i) {
9567 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9568 PyUnicode_READ(kind_sub, data_sub, i))
9569 return 0;
9570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573 }
9574
9575 return 0;
9576}
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578Py_ssize_t
9579PyUnicode_Tailmatch(PyObject *str,
9580 PyObject *substr,
9581 Py_ssize_t start,
9582 Py_ssize_t end,
9583 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589}
9590
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009591static PyObject *
9592ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009594 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9595 char *resdata, *data = PyUnicode_DATA(self);
9596 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009597
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009598 res = PyUnicode_New(len, 127);
9599 if (res == NULL)
9600 return NULL;
9601 resdata = PyUnicode_DATA(res);
9602 if (lower)
9603 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009605 _Py_bytes_upper(resdata, data, len);
9606 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607}
9608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009610handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009612 Py_ssize_t j;
9613 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009614 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009615 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009616
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009617 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9618
9619 where ! is a negation and \p{xxx} is a character with property xxx.
9620 */
9621 for (j = i - 1; j >= 0; j--) {
9622 c = PyUnicode_READ(kind, data, j);
9623 if (!_PyUnicode_IsCaseIgnorable(c))
9624 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009626 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9627 if (final_sigma) {
9628 for (j = i + 1; j < length; j++) {
9629 c = PyUnicode_READ(kind, data, j);
9630 if (!_PyUnicode_IsCaseIgnorable(c))
9631 break;
9632 }
9633 final_sigma = j == length || !_PyUnicode_IsCased(c);
9634 }
9635 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636}
9637
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009638static int
9639lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9640 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 /* Obscure special case. */
9643 if (c == 0x3A3) {
9644 mapped[0] = handle_capital_sigma(kind, data, length, i);
9645 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009647 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648}
9649
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650static Py_ssize_t
9651do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 Py_ssize_t i, k = 0;
9654 int n_res, j;
9655 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009656
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 c = PyUnicode_READ(kind, data, 0);
9658 n_res = _PyUnicode_ToUpperFull(c, mapped);
9659 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009660 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 for (i = 1; i < length; i++) {
9664 c = PyUnicode_READ(kind, data, i);
9665 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9666 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009667 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009669 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009670 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672}
9673
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674static Py_ssize_t
9675do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9676 Py_ssize_t i, k = 0;
9677
9678 for (i = 0; i < length; i++) {
9679 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9680 int n_res, j;
9681 if (Py_UNICODE_ISUPPER(c)) {
9682 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9683 }
9684 else if (Py_UNICODE_ISLOWER(c)) {
9685 n_res = _PyUnicode_ToUpperFull(c, mapped);
9686 }
9687 else {
9688 n_res = 1;
9689 mapped[0] = c;
9690 }
9691 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009692 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 res[k++] = mapped[j];
9694 }
9695 }
9696 return k;
9697}
9698
9699static Py_ssize_t
9700do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9701 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 Py_ssize_t i, k = 0;
9704
9705 for (i = 0; i < length; i++) {
9706 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9707 int n_res, j;
9708 if (lower)
9709 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9710 else
9711 n_res = _PyUnicode_ToUpperFull(c, mapped);
9712 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009713 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 res[k++] = mapped[j];
9715 }
9716 }
9717 return k;
9718}
9719
/* Upper-case the string described by (kind, data, length) into `res`.
   Thin wrapper that calls do_upper_or_lower() with lower == 0 and returns
   its result (the number of code points stored in `res`). */
static Py_ssize_t
do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
}
9725
/* Lower-case the string described by (kind, data, length) into `res`.
   Thin wrapper that calls do_upper_or_lower() with lower == 1 and returns
   its result (the number of code points stored in `res`). */
static Py_ssize_t
do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
}
9731
Benjamin Petersone51757f2012-01-12 21:10:29 -05009732static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009733do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9734{
9735 Py_ssize_t i, k = 0;
9736
9737 for (i = 0; i < length; i++) {
9738 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9739 Py_UCS4 mapped[3];
9740 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9741 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009742 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009743 res[k++] = mapped[j];
9744 }
9745 }
9746 return k;
9747}
9748
9749static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009750do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9751{
9752 Py_ssize_t i, k = 0;
9753 int previous_is_cased;
9754
9755 previous_is_cased = 0;
9756 for (i = 0; i < length; i++) {
9757 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9758 Py_UCS4 mapped[3];
9759 int n_res, j;
9760
9761 if (previous_is_cased)
9762 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9763 else
9764 n_res = _PyUnicode_ToTitleFull(c, mapped);
9765
9766 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009767 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009768 res[k++] = mapped[j];
9769 }
9770
9771 previous_is_cased = _PyUnicode_IsCased(c);
9772 }
9773 return k;
9774}
9775
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009776static PyObject *
9777case_operation(PyObject *self,
9778 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9779{
9780 PyObject *res = NULL;
9781 Py_ssize_t length, newlength = 0;
9782 int kind, outkind;
9783 void *data, *outdata;
9784 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9785
Benjamin Petersoneea48462012-01-16 14:28:50 -05009786 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009787
9788 kind = PyUnicode_KIND(self);
9789 data = PyUnicode_DATA(self);
9790 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009791 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009792 PyErr_SetString(PyExc_OverflowError, "string is too long");
9793 return NULL;
9794 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009795 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009796 if (tmp == NULL)
9797 return PyErr_NoMemory();
9798 newlength = perform(kind, data, length, tmp, &maxchar);
9799 res = PyUnicode_New(newlength, maxchar);
9800 if (res == NULL)
9801 goto leave;
9802 tmpend = tmp + newlength;
9803 outdata = PyUnicode_DATA(res);
9804 outkind = PyUnicode_KIND(res);
9805 switch (outkind) {
9806 case PyUnicode_1BYTE_KIND:
9807 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9808 break;
9809 case PyUnicode_2BYTE_KIND:
9810 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9811 break;
9812 case PyUnicode_4BYTE_KIND:
9813 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9814 break;
9815 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009816 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009817 }
9818 leave:
9819 PyMem_FREE(tmp);
9820 return res;
9821}
9822
Tim Peters8ce9f162004-08-27 01:49:32 +00009823PyObject *
9824PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009826 PyObject *res;
9827 PyObject *fseq;
9828 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009829 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009831 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009832 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009833 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009834 }
9835
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009836 /* NOTE: the following code can't call back into Python code,
9837 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009838 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009839
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009840 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009841 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009842 res = _PyUnicode_JoinArray(separator, items, seqlen);
9843 Py_DECREF(fseq);
9844 return res;
9845}
9846
/* Join the `seqlen` objects stored in `items`, inserting `separator`
   between consecutive items.

   separator: str placed between items; NULL selects a single space.  It is
              only examined when there are at least two items.
   items:     array of objects; every entry must be a str instance.
   seqlen:    number of entries in `items`.

   Returns the joined string, or NULL with an exception set.  TypeError is
   raised when the separator or an item is not a str, OverflowError when
   the total length would exceed PY_SSIZE_T_MAX. */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single item needs no separator; `sep` stays NULL. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* The separator takes part in the kind comparison below. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by a separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Compare in size_t so the check itself cannot overflow. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* The memcpy path is only kept while every piece seen so far
           (separator included) has the same kind as its predecessor. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Raw byte copies; `kind` is used as the per-character byte width. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                       sep_data,
                       kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                       PyUnicode_DATA(item),
                       kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* Pieces are copied with _PyUnicode_FastCopyCharacters(), tracking
           the write position as a character offset. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    /* Release the reference taken (or created) when sep was set up. */
    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10018
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width (1, 2 or 4 bytes)
   and must not be PyUnicode_WCHAR_KIND.

   Every macro argument is parenthesized at each use, so expression
   arguments (e.g. `x >> 8`) are evaluated as a whole before being cast or
   compared.  Note that `value` and `length` are evaluated more than once
   in the 2- and 4-byte cases. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10042
Victor Stinnerd3f08822012-05-29 12:57:52 +020010043void
10044_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10045 Py_UCS4 fill_char)
10046{
10047 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10048 const void *data = PyUnicode_DATA(unicode);
10049 assert(PyUnicode_IS_READY(unicode));
10050 assert(unicode_modifiable(unicode));
10051 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10052 assert(start >= 0);
10053 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10054 FILL(kind, data, fill_char, start, length);
10055}
10056
Victor Stinner3fe55312012-01-04 00:33:50 +010010057Py_ssize_t
10058PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10059 Py_UCS4 fill_char)
10060{
10061 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010062
10063 if (!PyUnicode_Check(unicode)) {
10064 PyErr_BadInternalCall();
10065 return -1;
10066 }
10067 if (PyUnicode_READY(unicode) == -1)
10068 return -1;
10069 if (unicode_check_modifiable(unicode))
10070 return -1;
10071
Victor Stinnerd3f08822012-05-29 12:57:52 +020010072 if (start < 0) {
10073 PyErr_SetString(PyExc_IndexError, "string index out of range");
10074 return -1;
10075 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010076 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10077 PyErr_SetString(PyExc_ValueError,
10078 "fill character is bigger than "
10079 "the string maximum character");
10080 return -1;
10081 }
10082
10083 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10084 length = Py_MIN(maxlen, length);
10085 if (length <= 0)
10086 return 0;
10087
Victor Stinnerd3f08822012-05-29 12:57:52 +020010088 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010089 return length;
10090}
10091
Victor Stinner9310abb2011-10-05 00:59:23 +020010092static PyObject *
10093pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010094 Py_ssize_t left,
10095 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 PyObject *u;
10099 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010100 int kind;
10101 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102
10103 if (left < 0)
10104 left = 0;
10105 if (right < 0)
10106 right = 0;
10107
Victor Stinnerc4b49542011-12-11 22:44:26 +010010108 if (left == 0 && right == 0)
10109 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10112 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010113 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10114 return NULL;
10115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010117 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010119 if (!u)
10120 return NULL;
10121
10122 kind = PyUnicode_KIND(u);
10123 data = PyUnicode_DATA(u);
10124 if (left)
10125 FILL(kind, data, fill, 0, left);
10126 if (right)
10127 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010128 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010129 assert(_PyUnicode_CheckConsistency(u, 1));
10130 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131}
10132
Alexander Belopolsky40018472011-02-26 01:02:56 +000010133PyObject *
10134PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010138 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
Benjamin Petersonead6b532011-12-20 17:23:42 -060010141 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010143 if (PyUnicode_IS_ASCII(string))
10144 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010145 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010146 PyUnicode_GET_LENGTH(string), keepends);
10147 else
10148 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010149 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 break;
10152 case PyUnicode_2BYTE_KIND:
10153 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010154 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 PyUnicode_GET_LENGTH(string), keepends);
10156 break;
10157 case PyUnicode_4BYTE_KIND:
10158 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 PyUnicode_GET_LENGTH(string), keepends);
10161 break;
10162 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010163 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166}
10167
Alexander Belopolsky40018472011-02-26 01:02:56 +000010168static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010169split(PyObject *self,
10170 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010171 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010173 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 void *buf1, *buf2;
10175 Py_ssize_t len1, len2;
10176 PyObject* out;
10177
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010179 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 if (PyUnicode_READY(self) == -1)
10182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010185 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010187 if (PyUnicode_IS_ASCII(self))
10188 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010189 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 PyUnicode_GET_LENGTH(self), maxcount
10191 );
10192 else
10193 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 PyUnicode_GET_LENGTH(self), maxcount
10196 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 case PyUnicode_2BYTE_KIND:
10198 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 PyUnicode_GET_LENGTH(self), maxcount
10201 );
10202 case PyUnicode_4BYTE_KIND:
10203 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010204 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 PyUnicode_GET_LENGTH(self), maxcount
10206 );
10207 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010208 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 }
10210
10211 if (PyUnicode_READY(substring) == -1)
10212 return NULL;
10213
10214 kind1 = PyUnicode_KIND(self);
10215 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 len1 = PyUnicode_GET_LENGTH(self);
10217 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010218 if (kind1 < kind2 || len1 < len2) {
10219 out = PyList_New(1);
10220 if (out == NULL)
10221 return NULL;
10222 Py_INCREF(self);
10223 PyList_SET_ITEM(out, 0, self);
10224 return out;
10225 }
10226 buf1 = PyUnicode_DATA(self);
10227 buf2 = PyUnicode_DATA(substring);
10228 if (kind2 != kind1) {
10229 buf2 = _PyUnicode_AsKind(substring, kind1);
10230 if (!buf2)
10231 return NULL;
10232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010234 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10237 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010239 else
10240 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 break;
10243 case PyUnicode_2BYTE_KIND:
10244 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010245 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 break;
10247 case PyUnicode_4BYTE_KIND:
10248 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 break;
10251 default:
10252 out = NULL;
10253 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010254 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 PyMem_Free(buf2);
10256 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257}
10258
Alexander Belopolsky40018472011-02-26 01:02:56 +000010259static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010260rsplit(PyObject *self,
10261 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010262 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010263{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010264 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 void *buf1, *buf2;
10266 Py_ssize_t len1, len2;
10267 PyObject* out;
10268
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010269 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010270 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 if (PyUnicode_READY(self) == -1)
10273 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010276 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 if (PyUnicode_IS_ASCII(self))
10279 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010280 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010281 PyUnicode_GET_LENGTH(self), maxcount
10282 );
10283 else
10284 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010285 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010286 PyUnicode_GET_LENGTH(self), maxcount
10287 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 case PyUnicode_2BYTE_KIND:
10289 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 PyUnicode_GET_LENGTH(self), maxcount
10292 );
10293 case PyUnicode_4BYTE_KIND:
10294 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 PyUnicode_GET_LENGTH(self), maxcount
10297 );
10298 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010299 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 }
10301
10302 if (PyUnicode_READY(substring) == -1)
10303 return NULL;
10304
10305 kind1 = PyUnicode_KIND(self);
10306 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 len1 = PyUnicode_GET_LENGTH(self);
10308 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010309 if (kind1 < kind2 || len1 < len2) {
10310 out = PyList_New(1);
10311 if (out == NULL)
10312 return NULL;
10313 Py_INCREF(self);
10314 PyList_SET_ITEM(out, 0, self);
10315 return out;
10316 }
10317 buf1 = PyUnicode_DATA(self);
10318 buf2 = PyUnicode_DATA(substring);
10319 if (kind2 != kind1) {
10320 buf2 = _PyUnicode_AsKind(substring, kind1);
10321 if (!buf2)
10322 return NULL;
10323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010325 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10328 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 else
10331 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 break;
10334 case PyUnicode_2BYTE_KIND:
10335 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 break;
10338 case PyUnicode_4BYTE_KIND:
10339 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 break;
10342 default:
10343 out = NULL;
10344 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010345 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 PyMem_Free(buf2);
10347 return out;
10348}
10349
10350static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010351anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10352 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010354 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10357 return asciilib_find(buf1, len1, buf2, len2, offset);
10358 else
10359 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 case PyUnicode_2BYTE_KIND:
10361 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10362 case PyUnicode_4BYTE_KIND:
10363 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10364 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010365 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366}
10367
10368static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10370 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010372 switch (kind) {
10373 case PyUnicode_1BYTE_KIND:
10374 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10375 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10376 else
10377 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10378 case PyUnicode_2BYTE_KIND:
10379 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10380 case PyUnicode_4BYTE_KIND:
10381 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10382 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010383 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010384}
10385
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010386static void
10387replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10388 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10389{
10390 int kind = PyUnicode_KIND(u);
10391 void *data = PyUnicode_DATA(u);
10392 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10393 if (kind == PyUnicode_1BYTE_KIND) {
10394 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10395 (Py_UCS1 *)data + len,
10396 u1, u2, maxcount);
10397 }
10398 else if (kind == PyUnicode_2BYTE_KIND) {
10399 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10400 (Py_UCS2 *)data + len,
10401 u1, u2, maxcount);
10402 }
10403 else {
10404 assert(kind == PyUnicode_4BYTE_KIND);
10405 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10406 (Py_UCS4 *)data + len,
10407 u1, u2, maxcount);
10408 }
10409}
10410
Alexander Belopolsky40018472011-02-26 01:02:56 +000010411static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412replace(PyObject *self, PyObject *str1,
10413 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 PyObject *u;
10416 char *sbuf = PyUnicode_DATA(self);
10417 char *buf1 = PyUnicode_DATA(str1);
10418 char *buf2 = PyUnicode_DATA(str2);
10419 int srelease = 0, release1 = 0, release2 = 0;
10420 int skind = PyUnicode_KIND(self);
10421 int kind1 = PyUnicode_KIND(str1);
10422 int kind2 = PyUnicode_KIND(str2);
10423 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10424 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10425 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010426 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010427 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
10429 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010430 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010432 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433
Victor Stinner59de0ee2011-10-07 10:01:28 +020010434 if (str1 == str2)
10435 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436
Victor Stinner49a0a212011-10-12 23:46:10 +020010437 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010438 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10439 if (maxchar < maxchar_str1)
10440 /* substring too wide to be present */
10441 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10443 /* Replacing str1 with str2 may cause a maxchar reduction in the
10444 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010445 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010446 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010451 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010454 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010455 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010456
Victor Stinner69ed0f42013-04-09 21:48:24 +020010457 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010458 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010459 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010461 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010465
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010466 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10467 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010468 }
10469 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 int rkind = skind;
10471 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010472 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (kind1 < rkind) {
10475 /* widen substring */
10476 buf1 = _PyUnicode_AsKind(str1, rkind);
10477 if (!buf1) goto error;
10478 release1 = 1;
10479 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010480 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010481 if (i < 0)
10482 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 if (rkind > kind2) {
10484 /* widen replacement */
10485 buf2 = _PyUnicode_AsKind(str2, rkind);
10486 if (!buf2) goto error;
10487 release2 = 1;
10488 }
10489 else if (rkind < kind2) {
10490 /* widen self and buf1 */
10491 rkind = kind2;
10492 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010493 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 sbuf = _PyUnicode_AsKind(self, rkind);
10495 if (!sbuf) goto error;
10496 srelease = 1;
10497 buf1 = _PyUnicode_AsKind(str1, rkind);
10498 if (!buf1) goto error;
10499 release1 = 1;
10500 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010501 u = PyUnicode_New(slen, maxchar);
10502 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010504 assert(PyUnicode_KIND(u) == rkind);
10505 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010506
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010507 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010508 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010509 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010511 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010513
10514 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010515 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010516 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010517 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010518 if (i == -1)
10519 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010520 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010522 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010526 }
10527 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010529 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 int rkind = skind;
10531 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010534 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 buf1 = _PyUnicode_AsKind(str1, rkind);
10536 if (!buf1) goto error;
10537 release1 = 1;
10538 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010539 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010540 if (n == 0)
10541 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 buf2 = _PyUnicode_AsKind(str2, rkind);
10545 if (!buf2) goto error;
10546 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010549 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 rkind = kind2;
10551 sbuf = _PyUnicode_AsKind(self, rkind);
10552 if (!sbuf) goto error;
10553 srelease = 1;
10554 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010555 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 buf1 = _PyUnicode_AsKind(str1, rkind);
10557 if (!buf1) goto error;
10558 release1 = 1;
10559 }
10560 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10561 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010562 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 PyErr_SetString(PyExc_OverflowError,
10564 "replace string is too long");
10565 goto error;
10566 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010567 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010568 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010569 _Py_INCREF_UNICODE_EMPTY();
10570 if (!unicode_empty)
10571 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 u = unicode_empty;
10573 goto done;
10574 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010575 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 PyErr_SetString(PyExc_OverflowError,
10577 "replace string is too long");
10578 goto error;
10579 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 u = PyUnicode_New(new_size, maxchar);
10581 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 assert(PyUnicode_KIND(u) == rkind);
10584 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 ires = i = 0;
10586 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010587 while (n-- > 0) {
10588 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010590 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010592 if (j == -1)
10593 break;
10594 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010595 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010596 memcpy(res + rkind * ires,
10597 sbuf + rkind * i,
10598 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 }
10601 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010603 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010605 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010612 memcpy(res + rkind * ires,
10613 sbuf + rkind * i,
10614 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 }
10616 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010617 /* interleave */
10618 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010619 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010621 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 if (--n <= 0)
10624 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010625 memcpy(res + rkind * ires,
10626 sbuf + rkind * i,
10627 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 ires++;
10629 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010630 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010631 memcpy(res + rkind * ires,
10632 sbuf + rkind * i,
10633 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 }
10636
10637 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010638 unicode_adjust_maxchar(&u);
10639 if (u == NULL)
10640 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010642
10643 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (srelease)
10645 PyMem_FREE(sbuf);
10646 if (release1)
10647 PyMem_FREE(buf1);
10648 if (release2)
10649 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010650 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (srelease)
10656 PyMem_FREE(sbuf);
10657 if (release1)
10658 PyMem_FREE(buf1);
10659 if (release2)
10660 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010661 return unicode_result_unchanged(self);
10662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 error:
10664 if (srelease && sbuf)
10665 PyMem_FREE(sbuf);
10666 if (release1 && buf1)
10667 PyMem_FREE(buf1);
10668 if (release2 && buf2)
10669 PyMem_FREE(buf2);
10670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671}
10672
/* --- Unicode Object Methods --------------------------------------------- */
10674
INADA Naoki3ae20562017-01-16 20:41:20 +090010675/*[clinic input]
10676str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677
INADA Naoki3ae20562017-01-16 20:41:20 +090010678Return a version of the string where each word is titlecased.
10679
10680More specifically, words start with uppercased characters and all remaining
10681cased characters have lower case.
10682[clinic start generated code]*/
10683
10684static PyObject *
10685unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010686/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010688 if (PyUnicode_READY(self) == -1)
10689 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010690 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691}
10692
INADA Naoki3ae20562017-01-16 20:41:20 +090010693/*[clinic input]
10694str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
INADA Naoki3ae20562017-01-16 20:41:20 +090010696Return a capitalized version of the string.
10697
10698More specifically, make the first character have upper case and the rest lower
10699case.
10700[clinic start generated code]*/
10701
10702static PyObject *
10703unicode_capitalize_impl(PyObject *self)
10704/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010706 if (PyUnicode_READY(self) == -1)
10707 return NULL;
10708 if (PyUnicode_GET_LENGTH(self) == 0)
10709 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010710 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711}
10712
INADA Naoki3ae20562017-01-16 20:41:20 +090010713/*[clinic input]
10714str.casefold as unicode_casefold
10715
10716Return a version of the string suitable for caseless comparisons.
10717[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010718
10719static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010720unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010721/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010722{
10723 if (PyUnicode_READY(self) == -1)
10724 return NULL;
10725 if (PyUnicode_IS_ASCII(self))
10726 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010727 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010728}
10729
10730
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010731/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010732
10733static int
10734convert_uc(PyObject *obj, void *addr)
10735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010737
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010738 if (!PyUnicode_Check(obj)) {
10739 PyErr_Format(PyExc_TypeError,
10740 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010741 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010742 return 0;
10743 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010744 if (PyUnicode_READY(obj) < 0)
10745 return 0;
10746 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010747 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 return 0;
10750 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010751 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010752 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010753}
10754
INADA Naoki3ae20562017-01-16 20:41:20 +090010755/*[clinic input]
10756str.center as unicode_center
10757
10758 width: Py_ssize_t
10759 fillchar: Py_UCS4 = ' '
10760 /
10761
10762Return a centered string of length width.
10763
10764Padding is done using the specified fill character (default is a space).
10765[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
10767static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010768unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10769/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010771 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772
Benjamin Petersonbac79492012-01-14 13:34:47 -050010773 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 return NULL;
10775
Victor Stinnerc4b49542011-12-11 22:44:26 +010010776 if (PyUnicode_GET_LENGTH(self) >= width)
10777 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
Victor Stinnerc4b49542011-12-11 22:44:26 +010010779 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 left = marg / 2 + (marg & width & 1);
10781
Victor Stinner9310abb2011-10-05 00:59:23 +020010782 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783}
10784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785/* This function assumes that str1 and str2 are readied by the caller. */
10786
Marc-André Lemburge5034372000-08-08 08:04:29 +000010787static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010788unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010789{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010790#define COMPARE(TYPE1, TYPE2) \
10791 do { \
10792 TYPE1* p1 = (TYPE1 *)data1; \
10793 TYPE2* p2 = (TYPE2 *)data2; \
10794 TYPE1* end = p1 + len; \
10795 Py_UCS4 c1, c2; \
10796 for (; p1 != end; p1++, p2++) { \
10797 c1 = *p1; \
10798 c2 = *p2; \
10799 if (c1 != c2) \
10800 return (c1 < c2) ? -1 : 1; \
10801 } \
10802 } \
10803 while (0)
10804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 int kind1, kind2;
10806 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010807 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 kind1 = PyUnicode_KIND(str1);
10810 kind2 = PyUnicode_KIND(str2);
10811 data1 = PyUnicode_DATA(str1);
10812 data2 = PyUnicode_DATA(str2);
10813 len1 = PyUnicode_GET_LENGTH(str1);
10814 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010815 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010816
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010817 switch(kind1) {
10818 case PyUnicode_1BYTE_KIND:
10819 {
10820 switch(kind2) {
10821 case PyUnicode_1BYTE_KIND:
10822 {
10823 int cmp = memcmp(data1, data2, len);
10824 /* normalize result of memcmp() into the range [-1; 1] */
10825 if (cmp < 0)
10826 return -1;
10827 if (cmp > 0)
10828 return 1;
10829 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010830 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010831 case PyUnicode_2BYTE_KIND:
10832 COMPARE(Py_UCS1, Py_UCS2);
10833 break;
10834 case PyUnicode_4BYTE_KIND:
10835 COMPARE(Py_UCS1, Py_UCS4);
10836 break;
10837 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010838 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010839 }
10840 break;
10841 }
10842 case PyUnicode_2BYTE_KIND:
10843 {
10844 switch(kind2) {
10845 case PyUnicode_1BYTE_KIND:
10846 COMPARE(Py_UCS2, Py_UCS1);
10847 break;
10848 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010849 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010850 COMPARE(Py_UCS2, Py_UCS2);
10851 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010852 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010853 case PyUnicode_4BYTE_KIND:
10854 COMPARE(Py_UCS2, Py_UCS4);
10855 break;
10856 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010857 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010858 }
10859 break;
10860 }
10861 case PyUnicode_4BYTE_KIND:
10862 {
10863 switch(kind2) {
10864 case PyUnicode_1BYTE_KIND:
10865 COMPARE(Py_UCS4, Py_UCS1);
10866 break;
10867 case PyUnicode_2BYTE_KIND:
10868 COMPARE(Py_UCS4, Py_UCS2);
10869 break;
10870 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010871 {
10872#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10873 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10874 /* normalize result of wmemcmp() into the range [-1; 1] */
10875 if (cmp < 0)
10876 return -1;
10877 if (cmp > 0)
10878 return 1;
10879#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010880 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010881#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010883 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010884 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010885 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010886 }
10887 break;
10888 }
10889 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010890 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010891 }
10892
Victor Stinner770e19e2012-10-04 22:59:45 +020010893 if (len1 == len2)
10894 return 0;
10895 if (len1 < len2)
10896 return -1;
10897 else
10898 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010899
10900#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010901}
10902
Benjamin Peterson621b4302016-09-09 13:54:34 -070010903static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010904unicode_compare_eq(PyObject *str1, PyObject *str2)
10905{
10906 int kind;
10907 void *data1, *data2;
10908 Py_ssize_t len;
10909 int cmp;
10910
Victor Stinnere5567ad2012-10-23 02:48:49 +020010911 len = PyUnicode_GET_LENGTH(str1);
10912 if (PyUnicode_GET_LENGTH(str2) != len)
10913 return 0;
10914 kind = PyUnicode_KIND(str1);
10915 if (PyUnicode_KIND(str2) != kind)
10916 return 0;
10917 data1 = PyUnicode_DATA(str1);
10918 data2 = PyUnicode_DATA(str2);
10919
10920 cmp = memcmp(data1, data2, len * kind);
10921 return (cmp == 0);
10922}
10923
10924
Alexander Belopolsky40018472011-02-26 01:02:56 +000010925int
10926PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10929 if (PyUnicode_READY(left) == -1 ||
10930 PyUnicode_READY(right) == -1)
10931 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010932
10933 /* a string is equal to itself */
10934 if (left == right)
10935 return 0;
10936
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010937 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010939 PyErr_Format(PyExc_TypeError,
10940 "Can't compare %.100s and %.100s",
10941 left->ob_type->tp_name,
10942 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943 return -1;
10944}
10945
Martin v. Löwis5b222132007-06-10 09:51:05 +000010946int
10947PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10948{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_ssize_t i;
10950 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010952 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953
Victor Stinner910337b2011-10-03 03:20:16 +020010954 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010955 if (!PyUnicode_IS_READY(uni)) {
10956 const wchar_t *ws = _PyUnicode_WSTR(uni);
10957 /* Compare Unicode string and source character set string */
10958 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10959 if (chr != ustr[i])
10960 return (chr < ustr[i]) ? -1 : 1;
10961 }
10962 /* This check keeps Python strings that end in '\0' from comparing equal
10963 to C strings identical up to that point. */
10964 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10965 return 1; /* uni is longer */
10966 if (ustr[i])
10967 return -1; /* str is longer */
10968 return 0;
10969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010971 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010972 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010973 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010974 size_t len, len2 = strlen(str);
10975 int cmp;
10976
10977 len = Py_MIN(len1, len2);
10978 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010979 if (cmp != 0) {
10980 if (cmp < 0)
10981 return -1;
10982 else
10983 return 1;
10984 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010985 if (len1 > len2)
10986 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010987 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010988 return -1; /* str is longer */
10989 return 0;
10990 }
10991 else {
10992 void *data = PyUnicode_DATA(uni);
10993 /* Compare Unicode string and source character set string */
10994 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010995 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010996 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10997 /* This check keeps Python strings that end in '\0' from comparing equal
10998 to C strings identical up to that point. */
10999 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11000 return 1; /* uni is longer */
11001 if (str[i])
11002 return -1; /* str is longer */
11003 return 0;
11004 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011005}
11006
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011007static int
11008non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11009{
11010 size_t i, len;
11011 const wchar_t *p;
11012 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11013 if (strlen(str) != len)
11014 return 0;
11015 p = _PyUnicode_WSTR(unicode);
11016 assert(p);
11017 for (i = 0; i < len; i++) {
11018 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011019 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011020 return 0;
11021 }
11022 return 1;
11023}
11024
11025int
11026_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11027{
11028 size_t len;
11029 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011030 assert(str);
11031#ifndef NDEBUG
11032 for (const char *p = str; *p; p++) {
11033 assert((unsigned char)*p < 128);
11034 }
11035#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011036 if (PyUnicode_READY(unicode) == -1) {
11037 /* Memory error or bad data */
11038 PyErr_Clear();
11039 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11040 }
11041 if (!PyUnicode_IS_ASCII(unicode))
11042 return 0;
11043 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11044 return strlen(str) == len &&
11045 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11046}
11047
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011048int
11049_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11050{
11051 PyObject *right_uni;
11052 Py_hash_t hash;
11053
11054 assert(_PyUnicode_CHECK(left));
11055 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011056#ifndef NDEBUG
11057 for (const char *p = right->string; *p; p++) {
11058 assert((unsigned char)*p < 128);
11059 }
11060#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011061
11062 if (PyUnicode_READY(left) == -1) {
11063 /* memory error or bad data */
11064 PyErr_Clear();
11065 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11066 }
11067
11068 if (!PyUnicode_IS_ASCII(left))
11069 return 0;
11070
11071 right_uni = _PyUnicode_FromId(right); /* borrowed */
11072 if (right_uni == NULL) {
11073 /* memory error or bad data */
11074 PyErr_Clear();
11075 return _PyUnicode_EqualToASCIIString(left, right->string);
11076 }
11077
11078 if (left == right_uni)
11079 return 1;
11080
11081 if (PyUnicode_CHECK_INTERNED(left))
11082 return 0;
11083
INADA Naoki7cc95f52018-01-28 02:07:09 +090011084 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011085 hash = _PyUnicode_HASH(left);
11086 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11087 return 0;
11088
11089 return unicode_compare_eq(left, right_uni);
11090}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011091
Alexander Belopolsky40018472011-02-26 01:02:56 +000011092PyObject *
11093PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011094{
11095 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011096
Victor Stinnere5567ad2012-10-23 02:48:49 +020011097 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11098 Py_RETURN_NOTIMPLEMENTED;
11099
11100 if (PyUnicode_READY(left) == -1 ||
11101 PyUnicode_READY(right) == -1)
11102 return NULL;
11103
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011104 if (left == right) {
11105 switch (op) {
11106 case Py_EQ:
11107 case Py_LE:
11108 case Py_GE:
11109 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011110 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011111 case Py_NE:
11112 case Py_LT:
11113 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011114 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011115 default:
11116 PyErr_BadArgument();
11117 return NULL;
11118 }
11119 }
11120 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011121 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011122 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011123 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011124 }
11125 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011126 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011127 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011128 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011129}
11130
Alexander Belopolsky40018472011-02-26 01:02:56 +000011131int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011132_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11133{
11134 return unicode_eq(aa, bb);
11135}
11136
11137int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011138PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011139{
Victor Stinner77282cb2013-04-14 19:22:47 +020011140 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 void *buf1, *buf2;
11142 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011143 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011144
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011145 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011147 "'in <string>' requires string as left operand, not %.100s",
11148 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011150 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011151 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011152 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011153 if (ensure_unicode(str) < 0)
11154 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011157 kind2 = PyUnicode_KIND(substr);
11158 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011159 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011161 len2 = PyUnicode_GET_LENGTH(substr);
11162 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011163 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011164 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011165 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 if (len2 == 1) {
11167 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11168 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011169 return result;
11170 }
11171 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011172 buf2 = _PyUnicode_AsKind(substr, kind1);
11173 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011174 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176
Victor Stinner77282cb2013-04-14 19:22:47 +020011177 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 case PyUnicode_1BYTE_KIND:
11179 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11180 break;
11181 case PyUnicode_2BYTE_KIND:
11182 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11183 break;
11184 case PyUnicode_4BYTE_KIND:
11185 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11186 break;
11187 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011188 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011190
Victor Stinner77282cb2013-04-14 19:22:47 +020011191 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 PyMem_Free(buf2);
11193
Guido van Rossum403d68b2000-03-13 15:55:09 +000011194 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011195}
11196
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197/* Concat to string or Unicode object giving a new Unicode object. */
11198
Alexander Belopolsky40018472011-02-26 01:02:56 +000011199PyObject *
11200PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011203 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011204 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011206 if (ensure_unicode(left) < 0)
11207 return NULL;
11208
11209 if (!PyUnicode_Check(right)) {
11210 PyErr_Format(PyExc_TypeError,
11211 "can only concatenate str (not \"%.200s\") to str",
11212 right->ob_type->tp_name);
11213 return NULL;
11214 }
11215 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217
11218 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011219 if (left == unicode_empty)
11220 return PyUnicode_FromObject(right);
11221 if (right == unicode_empty)
11222 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011224 left_len = PyUnicode_GET_LENGTH(left);
11225 right_len = PyUnicode_GET_LENGTH(right);
11226 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011227 PyErr_SetString(PyExc_OverflowError,
11228 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011230 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011232
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011233 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11234 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011235 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011238 result = PyUnicode_New(new_len, maxchar);
11239 if (result == NULL)
11240 return NULL;
11241 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11242 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11243 assert(_PyUnicode_CheckConsistency(result, 1));
11244 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245}
11246
Walter Dörwald1ab83302007-05-18 17:15:44 +000011247void
Victor Stinner23e56682011-10-03 03:54:37 +020011248PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011249{
Victor Stinner23e56682011-10-03 03:54:37 +020011250 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011251 Py_UCS4 maxchar, maxchar2;
11252 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011253
11254 if (p_left == NULL) {
11255 if (!PyErr_Occurred())
11256 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011257 return;
11258 }
Victor Stinner23e56682011-10-03 03:54:37 +020011259 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011260 if (right == NULL || left == NULL
11261 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011262 if (!PyErr_Occurred())
11263 PyErr_BadInternalCall();
11264 goto error;
11265 }
11266
Benjamin Petersonbac79492012-01-14 13:34:47 -050011267 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011268 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011269 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011270 goto error;
11271
Victor Stinner488fa492011-12-12 00:01:39 +010011272 /* Shortcuts */
11273 if (left == unicode_empty) {
11274 Py_DECREF(left);
11275 Py_INCREF(right);
11276 *p_left = right;
11277 return;
11278 }
11279 if (right == unicode_empty)
11280 return;
11281
11282 left_len = PyUnicode_GET_LENGTH(left);
11283 right_len = PyUnicode_GET_LENGTH(right);
11284 if (left_len > PY_SSIZE_T_MAX - right_len) {
11285 PyErr_SetString(PyExc_OverflowError,
11286 "strings are too large to concat");
11287 goto error;
11288 }
11289 new_len = left_len + right_len;
11290
11291 if (unicode_modifiable(left)
11292 && PyUnicode_CheckExact(right)
11293 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011294 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11295 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011296 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011297 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011298 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11299 {
11300 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011301 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011302 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011303
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011304 /* copy 'right' into the newly allocated area of 'left' */
11305 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011306 }
Victor Stinner488fa492011-12-12 00:01:39 +010011307 else {
11308 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11309 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011310 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011311
Victor Stinner488fa492011-12-12 00:01:39 +010011312 /* Concat the two Unicode strings */
11313 res = PyUnicode_New(new_len, maxchar);
11314 if (res == NULL)
11315 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011316 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11317 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011318 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011319 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011320 }
11321 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011322 return;
11323
11324error:
Victor Stinner488fa492011-12-12 00:01:39 +010011325 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011326}
11327
11328void
11329PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11330{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011331 PyUnicode_Append(pleft, right);
11332 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011333}
11334
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011335/*
11336Wraps stringlib_parse_args_finds() and additionally ensures that the
11337first argument is a unicode object.
11338*/
11339
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011340static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011341parse_args_finds_unicode(const char * function_name, PyObject *args,
11342 PyObject **substring,
11343 Py_ssize_t *start, Py_ssize_t *end)
11344{
11345 if(stringlib_parse_args_finds(function_name, args, substring,
11346 start, end)) {
11347 if (ensure_unicode(*substring) < 0)
11348 return 0;
11349 return 1;
11350 }
11351 return 0;
11352}
11353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011354PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011357Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011358string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011359interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
11361static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011362unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011364 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011365 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011366 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011368 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 void *buf1, *buf2;
11370 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011372 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 kind1 = PyUnicode_KIND(self);
11376 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011378 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 len1 = PyUnicode_GET_LENGTH(self);
11381 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011383 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011384 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011385
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011386 buf1 = PyUnicode_DATA(self);
11387 buf2 = PyUnicode_DATA(substring);
11388 if (kind2 != kind1) {
11389 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011390 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011391 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011392 }
11393 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 case PyUnicode_1BYTE_KIND:
11395 iresult = ucs1lib_count(
11396 ((Py_UCS1*)buf1) + start, end - start,
11397 buf2, len2, PY_SSIZE_T_MAX
11398 );
11399 break;
11400 case PyUnicode_2BYTE_KIND:
11401 iresult = ucs2lib_count(
11402 ((Py_UCS2*)buf1) + start, end - start,
11403 buf2, len2, PY_SSIZE_T_MAX
11404 );
11405 break;
11406 case PyUnicode_4BYTE_KIND:
11407 iresult = ucs4lib_count(
11408 ((Py_UCS4*)buf1) + start, end - start,
11409 buf2, len2, PY_SSIZE_T_MAX
11410 );
11411 break;
11412 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011413 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 }
11415
11416 result = PyLong_FromSsize_t(iresult);
11417
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011418 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 return result;
11422}
11423
INADA Naoki3ae20562017-01-16 20:41:20 +090011424/*[clinic input]
11425str.encode as unicode_encode
11426
11427 encoding: str(c_default="NULL") = 'utf-8'
11428 The encoding in which to encode the string.
11429 errors: str(c_default="NULL") = 'strict'
11430 The error handling scheme to use for encoding errors.
11431 The default is 'strict' meaning that encoding errors raise a
11432 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11433 'xmlcharrefreplace' as well as any other name registered with
11434 codecs.register_error that can handle UnicodeEncodeErrors.
11435
11436Encode the string using the codec registered for encoding.
11437[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011440unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011441/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011443 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011444}
11445
INADA Naoki3ae20562017-01-16 20:41:20 +090011446/*[clinic input]
11447str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
INADA Naoki3ae20562017-01-16 20:41:20 +090011449 tabsize: int = 8
11450
11451Return a copy where all tab characters are expanded using spaces.
11452
11453If tabsize is not given, a tab size of 8 characters is assumed.
11454[clinic start generated code]*/
11455
11456static PyObject *
11457unicode_expandtabs_impl(PyObject *self, int tabsize)
11458/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011460 Py_ssize_t i, j, line_pos, src_len, incr;
11461 Py_UCS4 ch;
11462 PyObject *u;
11463 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011464 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011465 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Antoine Pitrou22425222011-10-04 19:10:51 +020011467 if (PyUnicode_READY(self) == -1)
11468 return NULL;
11469
Thomas Wouters7e474022000-07-16 12:04:32 +000011470 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011471 src_len = PyUnicode_GET_LENGTH(self);
11472 i = j = line_pos = 0;
11473 kind = PyUnicode_KIND(self);
11474 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011475 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011476 for (; i < src_len; i++) {
11477 ch = PyUnicode_READ(kind, src_data, i);
11478 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011479 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011483 goto overflow;
11484 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011486 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011490 goto overflow;
11491 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011493 if (ch == '\n' || ch == '\r')
11494 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011496 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011497 if (!found)
11498 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011499
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011501 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 if (!u)
11503 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
Antoine Pitroue71d5742011-10-04 15:55:09 +020011506 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Antoine Pitroue71d5742011-10-04 15:55:09 +020011508 for (; i < src_len; i++) {
11509 ch = PyUnicode_READ(kind, src_data, i);
11510 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011512 incr = tabsize - (line_pos % tabsize);
11513 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011514 FILL(kind, dest_data, ' ', j, incr);
11515 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011517 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011519 line_pos++;
11520 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011521 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011522 if (ch == '\n' || ch == '\r')
11523 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011525 }
11526 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011527 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011528
Antoine Pitroue71d5742011-10-04 15:55:09 +020011529 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011530 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532}
11533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011534PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536\n\
11537Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011538such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539arguments start and end are interpreted as in slice notation.\n\
11540\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011541Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
11543static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011546 /* initialize variables to prevent gcc warning */
11547 PyObject *substring = NULL;
11548 Py_ssize_t start = 0;
11549 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011550 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011552 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011555 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011558 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (result == -2)
11561 return NULL;
11562
Christian Heimes217cfd12007-12-02 14:31:20 +000011563 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564}
11565
11566static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011567unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011569 void *data;
11570 enum PyUnicode_Kind kind;
11571 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011572
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011573 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011574 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011576 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011577 if (PyUnicode_READY(self) == -1) {
11578 return NULL;
11579 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011580 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11581 PyErr_SetString(PyExc_IndexError, "string index out of range");
11582 return NULL;
11583 }
11584 kind = PyUnicode_KIND(self);
11585 data = PyUnicode_DATA(self);
11586 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011587 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588}
11589
Guido van Rossumc2504932007-09-18 19:42:40 +000011590/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011591 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011592static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011593unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594{
Guido van Rossumc2504932007-09-18 19:42:40 +000011595 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011596 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011597
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011598#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011599 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011600#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 if (_PyUnicode_HASH(self) != -1)
11602 return _PyUnicode_HASH(self);
11603 if (PyUnicode_READY(self) == -1)
11604 return -1;
11605 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011606 /*
11607 We make the hash of the empty string be 0, rather than using
11608 (prefix ^ suffix), since this slightly obfuscates the hash secret
11609 */
11610 if (len == 0) {
11611 _PyUnicode_HASH(self) = 0;
11612 return 0;
11613 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011614 x = _Py_HashBytes(PyUnicode_DATA(self),
11615 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011617 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618}
11619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011620PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011621 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622\n\
oldkaa0735f2018-02-02 16:52:55 +080011623Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011624such that sub is contained within S[start:end]. Optional\n\
11625arguments start and end are interpreted as in slice notation.\n\
11626\n\
11627Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011632 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011633 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011634 PyObject *substring = NULL;
11635 Py_ssize_t start = 0;
11636 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011638 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011641 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011644 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 if (result == -2)
11647 return NULL;
11648
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649 if (result < 0) {
11650 PyErr_SetString(PyExc_ValueError, "substring not found");
11651 return NULL;
11652 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011653
Christian Heimes217cfd12007-12-02 14:31:20 +000011654 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655}
11656
INADA Naoki3ae20562017-01-16 20:41:20 +090011657/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011658str.isascii as unicode_isascii
11659
11660Return True if all characters in the string are ASCII, False otherwise.
11661
11662ASCII characters have code points in the range U+0000-U+007F.
11663Empty string is ASCII too.
11664[clinic start generated code]*/
11665
11666static PyObject *
11667unicode_isascii_impl(PyObject *self)
11668/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11669{
11670 if (PyUnicode_READY(self) == -1) {
11671 return NULL;
11672 }
11673 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11674}
11675
11676/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011677str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678
INADA Naoki3ae20562017-01-16 20:41:20 +090011679Return True if the string is a lowercase string, False otherwise.
11680
11681A string is lowercase if all cased characters in the string are lowercase and
11682there is at least one cased character in the string.
11683[clinic start generated code]*/
11684
11685static PyObject *
11686unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011687/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 Py_ssize_t i, length;
11690 int kind;
11691 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 int cased;
11693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 if (PyUnicode_READY(self) == -1)
11695 return NULL;
11696 length = PyUnicode_GET_LENGTH(self);
11697 kind = PyUnicode_KIND(self);
11698 data = PyUnicode_DATA(self);
11699
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 if (length == 1)
11702 return PyBool_FromLong(
11703 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011705 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011707 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011708
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 for (i = 0; i < length; i++) {
11711 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011712
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011714 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 else if (!cased && Py_UNICODE_ISLOWER(ch))
11716 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011718 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719}
11720
INADA Naoki3ae20562017-01-16 20:41:20 +090011721/*[clinic input]
11722str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
INADA Naoki3ae20562017-01-16 20:41:20 +090011724Return True if the string is an uppercase string, False otherwise.
11725
11726A string is uppercase if all cased characters in the string are uppercase and
11727there is at least one cased character in the string.
11728[clinic start generated code]*/
11729
11730static PyObject *
11731unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011732/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 Py_ssize_t i, length;
11735 int kind;
11736 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 int cased;
11738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (PyUnicode_READY(self) == -1)
11740 return NULL;
11741 length = PyUnicode_GET_LENGTH(self);
11742 kind = PyUnicode_KIND(self);
11743 data = PyUnicode_DATA(self);
11744
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (length == 1)
11747 return PyBool_FromLong(
11748 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011750 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011752 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011753
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 for (i = 0; i < length; i++) {
11756 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011757
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011759 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 else if (!cased && Py_UNICODE_ISUPPER(ch))
11761 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011763 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764}
11765
INADA Naoki3ae20562017-01-16 20:41:20 +090011766/*[clinic input]
11767str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
INADA Naoki3ae20562017-01-16 20:41:20 +090011769Return True if the string is a title-cased string, False otherwise.
11770
11771In a title-cased string, upper- and title-case characters may only
11772follow uncased characters and lowercase characters only cased ones.
11773[clinic start generated code]*/
11774
11775static PyObject *
11776unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011777/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 Py_ssize_t i, length;
11780 int kind;
11781 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 int cased, previous_is_cased;
11783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 if (PyUnicode_READY(self) == -1)
11785 return NULL;
11786 length = PyUnicode_GET_LENGTH(self);
11787 kind = PyUnicode_KIND(self);
11788 data = PyUnicode_DATA(self);
11789
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (length == 1) {
11792 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11793 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11794 (Py_UNICODE_ISUPPER(ch) != 0));
11795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011797 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011799 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011800
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 cased = 0;
11802 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 for (i = 0; i < length; i++) {
11804 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011805
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11807 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011808 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011809 previous_is_cased = 1;
11810 cased = 1;
11811 }
11812 else if (Py_UNICODE_ISLOWER(ch)) {
11813 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011814 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 previous_is_cased = 1;
11816 cased = 1;
11817 }
11818 else
11819 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011821 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822}
11823
INADA Naoki3ae20562017-01-16 20:41:20 +090011824/*[clinic input]
11825str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826
INADA Naoki3ae20562017-01-16 20:41:20 +090011827Return True if the string is a whitespace string, False otherwise.
11828
11829A string is whitespace if all characters in the string are whitespace and there
11830is at least one character in the string.
11831[clinic start generated code]*/
11832
11833static PyObject *
11834unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011835/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 Py_ssize_t i, length;
11838 int kind;
11839 void *data;
11840
11841 if (PyUnicode_READY(self) == -1)
11842 return NULL;
11843 length = PyUnicode_GET_LENGTH(self);
11844 kind = PyUnicode_KIND(self);
11845 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 if (length == 1)
11849 return PyBool_FromLong(
11850 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011852 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011854 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 for (i = 0; i < length; i++) {
11857 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011858 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011859 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011861 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862}
11863
INADA Naoki3ae20562017-01-16 20:41:20 +090011864/*[clinic input]
11865str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011866
INADA Naoki3ae20562017-01-16 20:41:20 +090011867Return True if the string is an alphabetic string, False otherwise.
11868
11869A string is alphabetic if all characters in the string are alphabetic and there
11870is at least one character in the string.
11871[clinic start generated code]*/
11872
11873static PyObject *
11874unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011875/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 Py_ssize_t i, length;
11878 int kind;
11879 void *data;
11880
11881 if (PyUnicode_READY(self) == -1)
11882 return NULL;
11883 length = PyUnicode_GET_LENGTH(self);
11884 kind = PyUnicode_KIND(self);
11885 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011886
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011887 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (length == 1)
11889 return PyBool_FromLong(
11890 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011891
11892 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011894 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 for (i = 0; i < length; i++) {
11897 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011898 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011899 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011900 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011901}
11902
INADA Naoki3ae20562017-01-16 20:41:20 +090011903/*[clinic input]
11904str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011905
INADA Naoki3ae20562017-01-16 20:41:20 +090011906Return True if the string is an alpha-numeric string, False otherwise.
11907
11908A string is alpha-numeric if all characters in the string are alpha-numeric and
11909there is at least one character in the string.
11910[clinic start generated code]*/
11911
11912static PyObject *
11913unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011914/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 int kind;
11917 void *data;
11918 Py_ssize_t len, i;
11919
11920 if (PyUnicode_READY(self) == -1)
11921 return NULL;
11922
11923 kind = PyUnicode_KIND(self);
11924 data = PyUnicode_DATA(self);
11925 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011926
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011927 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (len == 1) {
11929 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11930 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11931 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011932
11933 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011935 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 for (i = 0; i < len; i++) {
11938 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011939 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011940 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011942 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011943}
11944
INADA Naoki3ae20562017-01-16 20:41:20 +090011945/*[clinic input]
11946str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
INADA Naoki3ae20562017-01-16 20:41:20 +090011948Return True if the string is a decimal string, False otherwise.
11949
11950A string is a decimal string if all characters in the string are decimal and
11951there is at least one character in the string.
11952[clinic start generated code]*/
11953
11954static PyObject *
11955unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011956/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 Py_ssize_t i, length;
11959 int kind;
11960 void *data;
11961
11962 if (PyUnicode_READY(self) == -1)
11963 return NULL;
11964 length = PyUnicode_GET_LENGTH(self);
11965 kind = PyUnicode_KIND(self);
11966 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 if (length == 1)
11970 return PyBool_FromLong(
11971 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011973 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011975 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 for (i = 0; i < length; i++) {
11978 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011979 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011981 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982}
11983
INADA Naoki3ae20562017-01-16 20:41:20 +090011984/*[clinic input]
11985str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
INADA Naoki3ae20562017-01-16 20:41:20 +090011987Return True if the string is a digit string, False otherwise.
11988
11989A string is a digit string if all characters in the string are digits and there
11990is at least one character in the string.
11991[clinic start generated code]*/
11992
11993static PyObject *
11994unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011995/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 Py_ssize_t i, length;
11998 int kind;
11999 void *data;
12000
12001 if (PyUnicode_READY(self) == -1)
12002 return NULL;
12003 length = PyUnicode_GET_LENGTH(self);
12004 kind = PyUnicode_KIND(self);
12005 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (length == 1) {
12009 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12010 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012013 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012015 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 for (i = 0; i < length; i++) {
12018 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012019 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012021 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022}
12023
INADA Naoki3ae20562017-01-16 20:41:20 +090012024/*[clinic input]
12025str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
INADA Naoki3ae20562017-01-16 20:41:20 +090012027Return True if the string is a numeric string, False otherwise.
12028
12029A string is numeric if all characters in the string are numeric and there is at
12030least one character in the string.
12031[clinic start generated code]*/
12032
12033static PyObject *
12034unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012035/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 Py_ssize_t i, length;
12038 int kind;
12039 void *data;
12040
12041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043 length = PyUnicode_GET_LENGTH(self);
12044 kind = PyUnicode_KIND(self);
12045 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 1)
12049 return PyBool_FromLong(
12050 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012052 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012054 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 for (i = 0; i < length; i++) {
12057 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012058 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012060 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061}
12062
Martin v. Löwis47383402007-08-15 07:32:56 +000012063int
12064PyUnicode_IsIdentifier(PyObject *self)
12065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 int kind;
12067 void *data;
12068 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012069 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 if (PyUnicode_READY(self) == -1) {
12072 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 }
12075
12076 /* Special case for empty strings */
12077 if (PyUnicode_GET_LENGTH(self) == 0)
12078 return 0;
12079 kind = PyUnicode_KIND(self);
12080 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012081
12082 /* PEP 3131 says that the first character must be in
12083 XID_Start and subsequent characters in XID_Continue,
12084 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012085 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012086 letters, digits, underscore). However, given the current
12087 definition of XID_Start and XID_Continue, it is sufficient
12088 to check just for these, except that _ must be allowed
12089 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012091 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012092 return 0;
12093
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012094 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012097 return 1;
12098}
12099
INADA Naoki3ae20562017-01-16 20:41:20 +090012100/*[clinic input]
12101str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012102
INADA Naoki3ae20562017-01-16 20:41:20 +090012103Return True if the string is a valid Python identifier, False otherwise.
12104
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012105Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012106such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012107[clinic start generated code]*/
12108
12109static PyObject *
12110unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012111/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012112{
12113 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12114}
12115
INADA Naoki3ae20562017-01-16 20:41:20 +090012116/*[clinic input]
12117str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012118
INADA Naoki3ae20562017-01-16 20:41:20 +090012119Return True if the string is printable, False otherwise.
12120
12121A string is printable if all of its characters are considered printable in
12122repr() or if it is empty.
12123[clinic start generated code]*/
12124
12125static PyObject *
12126unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012127/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 Py_ssize_t i, length;
12130 int kind;
12131 void *data;
12132
12133 if (PyUnicode_READY(self) == -1)
12134 return NULL;
12135 length = PyUnicode_GET_LENGTH(self);
12136 kind = PyUnicode_KIND(self);
12137 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012138
12139 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (length == 1)
12141 return PyBool_FromLong(
12142 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 for (i = 0; i < length; i++) {
12145 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012146 Py_RETURN_FALSE;
12147 }
12148 }
12149 Py_RETURN_TRUE;
12150}
12151
INADA Naoki3ae20562017-01-16 20:41:20 +090012152/*[clinic input]
12153str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
INADA Naoki3ae20562017-01-16 20:41:20 +090012155 iterable: object
12156 /
12157
12158Concatenate any number of strings.
12159
Martin Panter91a88662017-01-24 00:30:06 +000012160The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012161The result is returned as a new string.
12162
12163Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12164[clinic start generated code]*/
12165
12166static PyObject *
12167unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012168/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169{
INADA Naoki3ae20562017-01-16 20:41:20 +090012170 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171}
12172
Martin v. Löwis18e16552006-02-15 17:27:45 +000012173static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012174unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (PyUnicode_READY(self) == -1)
12177 return -1;
12178 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179}
12180
INADA Naoki3ae20562017-01-16 20:41:20 +090012181/*[clinic input]
12182str.ljust as unicode_ljust
12183
12184 width: Py_ssize_t
12185 fillchar: Py_UCS4 = ' '
12186 /
12187
12188Return a left-justified string of length width.
12189
12190Padding is done using the specified fill character (default is a space).
12191[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192
12193static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012194unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12195/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012197 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
Victor Stinnerc4b49542011-12-11 22:44:26 +010012200 if (PyUnicode_GET_LENGTH(self) >= width)
12201 return unicode_result_unchanged(self);
12202
12203 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204}
12205
INADA Naoki3ae20562017-01-16 20:41:20 +090012206/*[clinic input]
12207str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
INADA Naoki3ae20562017-01-16 20:41:20 +090012209Return a copy of the string converted to lowercase.
12210[clinic start generated code]*/
12211
12212static PyObject *
12213unicode_lower_impl(PyObject *self)
12214/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012216 if (PyUnicode_READY(self) == -1)
12217 return NULL;
12218 if (PyUnicode_IS_ASCII(self))
12219 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012220 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221}
12222
/* Strip types: which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip type, indexed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Look up the method name for strip type i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012231
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012232/* externally visible for str.strip(unicode) */
12233PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012234_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 void *data;
12237 int kind;
12238 Py_ssize_t i, j, len;
12239 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012240 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12243 return NULL;
12244
12245 kind = PyUnicode_KIND(self);
12246 data = PyUnicode_DATA(self);
12247 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012248 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12250 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012251 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012252
Benjamin Peterson14339b62009-01-31 16:36:08 +000012253 i = 0;
12254 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012255 while (i < len) {
12256 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12257 if (!BLOOM(sepmask, ch))
12258 break;
12259 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12260 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 i++;
12262 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012263 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012264
Benjamin Peterson14339b62009-01-31 16:36:08 +000012265 j = len;
12266 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012267 j--;
12268 while (j >= i) {
12269 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12270 if (!BLOOM(sepmask, ch))
12271 break;
12272 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12273 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012275 }
12276
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012279
Victor Stinner7931d9a2011-11-04 00:22:48 +010012280 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281}
12282
12283PyObject*
12284PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12285{
12286 unsigned char *data;
12287 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012288 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289
Victor Stinnerde636f32011-10-01 03:55:54 +020012290 if (PyUnicode_READY(self) == -1)
12291 return NULL;
12292
Victor Stinner684d5fd2012-05-03 02:32:34 +020012293 length = PyUnicode_GET_LENGTH(self);
12294 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012295
Victor Stinner684d5fd2012-05-03 02:32:34 +020012296 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012297 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298
Victor Stinnerde636f32011-10-01 03:55:54 +020012299 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012300 PyErr_SetString(PyExc_IndexError, "string index out of range");
12301 return NULL;
12302 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012303 if (start >= length || end < start)
12304 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012305
Victor Stinner684d5fd2012-05-03 02:32:34 +020012306 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012307 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012308 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012309 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012310 }
12311 else {
12312 kind = PyUnicode_KIND(self);
12313 data = PyUnicode_1BYTE_DATA(self);
12314 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012315 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012316 length);
12317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319
12320static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012321do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 Py_ssize_t len, i, j;
12324
12325 if (PyUnicode_READY(self) == -1)
12326 return NULL;
12327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012329
Victor Stinnercc7af722013-04-09 22:39:24 +020012330 if (PyUnicode_IS_ASCII(self)) {
12331 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12332
12333 i = 0;
12334 if (striptype != RIGHTSTRIP) {
12335 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012336 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012337 if (!_Py_ascii_whitespace[ch])
12338 break;
12339 i++;
12340 }
12341 }
12342
12343 j = len;
12344 if (striptype != LEFTSTRIP) {
12345 j--;
12346 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012347 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012348 if (!_Py_ascii_whitespace[ch])
12349 break;
12350 j--;
12351 }
12352 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012353 }
12354 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012355 else {
12356 int kind = PyUnicode_KIND(self);
12357 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012358
Victor Stinnercc7af722013-04-09 22:39:24 +020012359 i = 0;
12360 if (striptype != RIGHTSTRIP) {
12361 while (i < len) {
12362 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12363 if (!Py_UNICODE_ISSPACE(ch))
12364 break;
12365 i++;
12366 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012367 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012368
12369 j = len;
12370 if (striptype != LEFTSTRIP) {
12371 j--;
12372 while (j >= i) {
12373 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12374 if (!Py_UNICODE_ISSPACE(ch))
12375 break;
12376 j--;
12377 }
12378 j++;
12379 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012380 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012381
Victor Stinner7931d9a2011-11-04 00:22:48 +010012382 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383}
12384
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012385
12386static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012387do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012388{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012389 if (sep != NULL && sep != Py_None) {
12390 if (PyUnicode_Check(sep))
12391 return _PyUnicode_XStrip(self, striptype, sep);
12392 else {
12393 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 "%s arg must be None or str",
12395 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012396 return NULL;
12397 }
12398 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012399
Benjamin Peterson14339b62009-01-31 16:36:08 +000012400 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012401}
12402
12403
INADA Naoki3ae20562017-01-16 20:41:20 +090012404/*[clinic input]
12405str.strip as unicode_strip
12406
12407 chars: object = None
12408 /
12409
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012411
12412If chars is given and not None, remove characters in chars instead.
12413[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012414
12415static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012416unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012417/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012418{
INADA Naoki3ae20562017-01-16 20:41:20 +090012419 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012420}
12421
12422
INADA Naoki3ae20562017-01-16 20:41:20 +090012423/*[clinic input]
12424str.lstrip as unicode_lstrip
12425
12426 chars: object = NULL
12427 /
12428
12429Return a copy of the string with leading whitespace removed.
12430
12431If chars is given and not None, remove characters in chars instead.
12432[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012433
12434static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012435unicode_lstrip_impl(PyObject *self, PyObject *chars)
12436/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012437{
INADA Naoki3ae20562017-01-16 20:41:20 +090012438 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012439}
12440
12441
INADA Naoki3ae20562017-01-16 20:41:20 +090012442/*[clinic input]
12443str.rstrip as unicode_rstrip
12444
12445 chars: object = NULL
12446 /
12447
12448Return a copy of the string with trailing whitespace removed.
12449
12450If chars is given and not None, remove characters in chars instead.
12451[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012452
12453static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012454unicode_rstrip_impl(PyObject *self, PyObject *chars)
12455/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456{
INADA Naoki3ae20562017-01-16 20:41:20 +090012457 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458}
12459
12460
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012462unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012464 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466
Serhiy Storchaka05997252013-01-26 12:14:02 +020012467 if (len < 1)
12468 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469
Victor Stinnerc4b49542011-12-11 22:44:26 +010012470 /* no repeat, return original string */
12471 if (len == 1)
12472 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012473
Benjamin Petersonbac79492012-01-14 13:34:47 -050012474 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 return NULL;
12476
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012477 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012478 PyErr_SetString(PyExc_OverflowError,
12479 "repeated string is too long");
12480 return NULL;
12481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012483
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012484 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485 if (!u)
12486 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012487 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 if (PyUnicode_GET_LENGTH(str) == 1) {
12490 const int kind = PyUnicode_KIND(str);
12491 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012492 if (kind == PyUnicode_1BYTE_KIND) {
12493 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012494 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012495 }
12496 else if (kind == PyUnicode_2BYTE_KIND) {
12497 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012498 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012499 ucs2[n] = fill_char;
12500 } else {
12501 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12502 assert(kind == PyUnicode_4BYTE_KIND);
12503 for (n = 0; n < len; ++n)
12504 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 }
12507 else {
12508 /* number of characters copied this far */
12509 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012510 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012512 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012516 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012517 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519 }
12520
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012521 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012522 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523}
12524
Alexander Belopolsky40018472011-02-26 01:02:56 +000012525PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012526PyUnicode_Replace(PyObject *str,
12527 PyObject *substr,
12528 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012529 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012531 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12532 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012534 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535}
12536
INADA Naoki3ae20562017-01-16 20:41:20 +090012537/*[clinic input]
12538str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
INADA Naoki3ae20562017-01-16 20:41:20 +090012540 old: unicode
12541 new: unicode
12542 count: Py_ssize_t = -1
12543 Maximum number of occurrences to replace.
12544 -1 (the default value) means replace all occurrences.
12545 /
12546
12547Return a copy with all occurrences of substring old replaced by new.
12548
12549If the optional argument count is given, only the first count occurrences are
12550replaced.
12551[clinic start generated code]*/
12552
12553static PyObject *
12554unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12555 Py_ssize_t count)
12556/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012558 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012560 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561}
12562
Alexander Belopolsky40018472011-02-26 01:02:56 +000012563static PyObject *
12564unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012566 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 Py_ssize_t isize;
12568 Py_ssize_t osize, squote, dquote, i, o;
12569 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012570 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012574 return NULL;
12575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 isize = PyUnicode_GET_LENGTH(unicode);
12577 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 /* Compute length of output, quote characters, and
12580 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012581 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 max = 127;
12583 squote = dquote = 0;
12584 ikind = PyUnicode_KIND(unicode);
12585 for (i = 0; i < isize; i++) {
12586 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012587 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012589 case '\'': squote++; break;
12590 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012592 incr = 2;
12593 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 default:
12595 /* Fast-path ASCII */
12596 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012597 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012599 ;
12600 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012603 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012605 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012607 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012609 if (osize > PY_SSIZE_T_MAX - incr) {
12610 PyErr_SetString(PyExc_OverflowError,
12611 "string is too long to generate repr");
12612 return NULL;
12613 }
12614 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 }
12616
12617 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012618 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012620 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 if (dquote)
12622 /* Both squote and dquote present. Use squote,
12623 and escape them */
12624 osize += squote;
12625 else
12626 quote = '"';
12627 }
Victor Stinner55c08782013-04-14 18:45:39 +020012628 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629
12630 repr = PyUnicode_New(osize, max);
12631 if (repr == NULL)
12632 return NULL;
12633 okind = PyUnicode_KIND(repr);
12634 odata = PyUnicode_DATA(repr);
12635
12636 PyUnicode_WRITE(okind, odata, 0, quote);
12637 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012638 if (unchanged) {
12639 _PyUnicode_FastCopyCharacters(repr, 1,
12640 unicode, 0,
12641 isize);
12642 }
12643 else {
12644 for (i = 0, o = 1; i < isize; i++) {
12645 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646
Victor Stinner55c08782013-04-14 18:45:39 +020012647 /* Escape quotes and backslashes */
12648 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012649 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012651 continue;
12652 }
12653
12654 /* Map special whitespace to '\t', \n', '\r' */
12655 if (ch == '\t') {
12656 PyUnicode_WRITE(okind, odata, o++, '\\');
12657 PyUnicode_WRITE(okind, odata, o++, 't');
12658 }
12659 else if (ch == '\n') {
12660 PyUnicode_WRITE(okind, odata, o++, '\\');
12661 PyUnicode_WRITE(okind, odata, o++, 'n');
12662 }
12663 else if (ch == '\r') {
12664 PyUnicode_WRITE(okind, odata, o++, '\\');
12665 PyUnicode_WRITE(okind, odata, o++, 'r');
12666 }
12667
12668 /* Map non-printable US ASCII to '\xhh' */
12669 else if (ch < ' ' || ch == 0x7F) {
12670 PyUnicode_WRITE(okind, odata, o++, '\\');
12671 PyUnicode_WRITE(okind, odata, o++, 'x');
12672 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12673 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12674 }
12675
12676 /* Copy ASCII characters as-is */
12677 else if (ch < 0x7F) {
12678 PyUnicode_WRITE(okind, odata, o++, ch);
12679 }
12680
12681 /* Non-ASCII characters */
12682 else {
12683 /* Map Unicode whitespace and control characters
12684 (categories Z* and C* except ASCII space)
12685 */
12686 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12687 PyUnicode_WRITE(okind, odata, o++, '\\');
12688 /* Map 8-bit characters to '\xhh' */
12689 if (ch <= 0xff) {
12690 PyUnicode_WRITE(okind, odata, o++, 'x');
12691 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12692 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12693 }
12694 /* Map 16-bit characters to '\uxxxx' */
12695 else if (ch <= 0xffff) {
12696 PyUnicode_WRITE(okind, odata, o++, 'u');
12697 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12698 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12699 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12700 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12701 }
12702 /* Map 21-bit characters to '\U00xxxxxx' */
12703 else {
12704 PyUnicode_WRITE(okind, odata, o++, 'U');
12705 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12706 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12709 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12713 }
12714 }
12715 /* Copy characters as-is */
12716 else {
12717 PyUnicode_WRITE(okind, odata, o++, ch);
12718 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012719 }
12720 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012723 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012724 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725}
12726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012727PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729\n\
12730Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012731such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732arguments start and end are interpreted as in slice notation.\n\
12733\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012734Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735
12736static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012739 /* initialize variables to prevent gcc warning */
12740 PyObject *substring = NULL;
12741 Py_ssize_t start = 0;
12742 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012745 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012748 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012751 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 if (result == -2)
12754 return NULL;
12755
Christian Heimes217cfd12007-12-02 14:31:20 +000012756 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012757}
12758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012759PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012762Return the highest index in S where substring sub is found,\n\
12763such that sub is contained within S[start:end]. Optional\n\
12764arguments start and end are interpreted as in slice notation.\n\
12765\n\
12766Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767
12768static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012771 /* initialize variables to prevent gcc warning */
12772 PyObject *substring = NULL;
12773 Py_ssize_t start = 0;
12774 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012777 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012780 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012783 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 if (result == -2)
12786 return NULL;
12787
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788 if (result < 0) {
12789 PyErr_SetString(PyExc_ValueError, "substring not found");
12790 return NULL;
12791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792
Christian Heimes217cfd12007-12-02 14:31:20 +000012793 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794}
12795
INADA Naoki3ae20562017-01-16 20:41:20 +090012796/*[clinic input]
12797str.rjust as unicode_rjust
12798
12799 width: Py_ssize_t
12800 fillchar: Py_UCS4 = ' '
12801 /
12802
12803Return a right-justified string of length width.
12804
12805Padding is done using the specified fill character (default is a space).
12806[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807
12808static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012809unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12810/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012812 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813 return NULL;
12814
Victor Stinnerc4b49542011-12-11 22:44:26 +010012815 if (PyUnicode_GET_LENGTH(self) >= width)
12816 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
Victor Stinnerc4b49542011-12-11 22:44:26 +010012818 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819}
12820
Alexander Belopolsky40018472011-02-26 01:02:56 +000012821PyObject *
12822PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012824 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012827 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828}
12829
INADA Naoki3ae20562017-01-16 20:41:20 +090012830/*[clinic input]
12831str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832
INADA Naoki3ae20562017-01-16 20:41:20 +090012833 sep: object = None
        The delimiter according to which to split the string.
12835 None (the default value) means split according to any whitespace,
12836 and discard empty strings from the result.
12837 maxsplit: Py_ssize_t = -1
12838 Maximum number of splits to do.
12839 -1 (the default value) means no limit.
12840
12841Return a list of the words in the string, using sep as the delimiter string.
12842[clinic start generated code]*/
12843
12844static PyObject *
12845unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12846/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847{
INADA Naoki3ae20562017-01-16 20:41:20 +090012848 if (sep == Py_None)
12849 return split(self, NULL, maxsplit);
12850 if (PyUnicode_Check(sep))
12851 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012852
Victor Stinner998b8062018-09-12 00:23:25 +020012853 PyErr_Format(PyExc_TypeError,
12854 "must be str or None, not %.100s",
12855 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857}
12858
Thomas Wouters477c8d52006-05-27 19:21:47 +000012859PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012860PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012861{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012863 int kind1, kind2;
12864 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012866
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012867 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012869
Victor Stinner14f8f022011-10-05 20:58:25 +020012870 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 len1 = PyUnicode_GET_LENGTH(str_obj);
12873 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012874 if (kind1 < kind2 || len1 < len2) {
12875 _Py_INCREF_UNICODE_EMPTY();
12876 if (!unicode_empty)
12877 out = NULL;
12878 else {
12879 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12880 Py_DECREF(unicode_empty);
12881 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012882 return out;
12883 }
12884 buf1 = PyUnicode_DATA(str_obj);
12885 buf2 = PyUnicode_DATA(sep_obj);
12886 if (kind2 != kind1) {
12887 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12888 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012889 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012892 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012894 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12895 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12896 else
12897 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 break;
12899 case PyUnicode_2BYTE_KIND:
12900 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12901 break;
12902 case PyUnicode_4BYTE_KIND:
12903 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12904 break;
12905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012906 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012909 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012911
12912 return out;
12913}
12914
12915
12916PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012917PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012918{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012919 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012920 int kind1, kind2;
12921 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012924 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012926
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012927 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 len1 = PyUnicode_GET_LENGTH(str_obj);
12930 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012931 if (kind1 < kind2 || len1 < len2) {
12932 _Py_INCREF_UNICODE_EMPTY();
12933 if (!unicode_empty)
12934 out = NULL;
12935 else {
12936 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12937 Py_DECREF(unicode_empty);
12938 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012939 return out;
12940 }
12941 buf1 = PyUnicode_DATA(str_obj);
12942 buf2 = PyUnicode_DATA(sep_obj);
12943 if (kind2 != kind1) {
12944 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12945 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012946 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012949 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012951 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12952 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12953 else
12954 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955 break;
12956 case PyUnicode_2BYTE_KIND:
12957 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12958 break;
12959 case PyUnicode_4BYTE_KIND:
12960 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12961 break;
12962 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012963 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012965
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012966 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968
12969 return out;
12970}
12971
INADA Naoki3ae20562017-01-16 20:41:20 +090012972/*[clinic input]
12973str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974
INADA Naoki3ae20562017-01-16 20:41:20 +090012975 sep: object
12976 /
12977
12978Partition the string into three parts using the given separator.
12979
12980This will search for the separator in the string. If the separator is found,
12981returns a 3-tuple containing the part before the separator, the separator
12982itself, and the part after it.
12983
12984If the separator is not found, returns a 3-tuple containing the original string
12985and two empty strings.
12986[clinic start generated code]*/
12987
12988static PyObject *
12989unicode_partition(PyObject *self, PyObject *sep)
12990/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991{
INADA Naoki3ae20562017-01-16 20:41:20 +090012992 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993}
12994
INADA Naoki3ae20562017-01-16 20:41:20 +090012995/*[clinic input]
12996str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012997
INADA Naoki3ae20562017-01-16 20:41:20 +090012998Partition the string into three parts using the given separator.
12999
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013000This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013001the separator is found, returns a 3-tuple containing the part before the
13002separator, the separator itself, and the part after it.
13003
13004If the separator is not found, returns a 3-tuple containing two empty strings
13005and the original string.
13006[clinic start generated code]*/
13007
13008static PyObject *
13009unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013010/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013011{
INADA Naoki3ae20562017-01-16 20:41:20 +090013012 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013013}
13014
Alexander Belopolsky40018472011-02-26 01:02:56 +000013015PyObject *
13016PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013017{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013018 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013019 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013020
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013021 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013022}
13023
INADA Naoki3ae20562017-01-16 20:41:20 +090013024/*[clinic input]
13025str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013026
INADA Naoki3ae20562017-01-16 20:41:20 +090013027Return a list of the words in the string, using sep as the delimiter string.
13028
13029Splits are done starting at the end of the string and working to the front.
13030[clinic start generated code]*/
13031
13032static PyObject *
13033unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13034/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013035{
INADA Naoki3ae20562017-01-16 20:41:20 +090013036 if (sep == Py_None)
13037 return rsplit(self, NULL, maxsplit);
13038 if (PyUnicode_Check(sep))
13039 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013040
Victor Stinner998b8062018-09-12 00:23:25 +020013041 PyErr_Format(PyExc_TypeError,
13042 "must be str or None, not %.100s",
13043 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013044 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013045}
13046
INADA Naoki3ae20562017-01-16 20:41:20 +090013047/*[clinic input]
13048str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013050 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013051
13052Return a list of the lines in the string, breaking at line boundaries.
13053
13054Line breaks are not included in the resulting list unless keepends is given and
13055true.
13056[clinic start generated code]*/
13057
13058static PyObject *
13059unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013060/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013062 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063}
13064
13065static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013066PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013068 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069}
13070
INADA Naoki3ae20562017-01-16 20:41:20 +090013071/*[clinic input]
13072str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
INADA Naoki3ae20562017-01-16 20:41:20 +090013074Convert uppercase characters to lowercase and lowercase characters to uppercase.
13075[clinic start generated code]*/
13076
13077static PyObject *
13078unicode_swapcase_impl(PyObject *self)
13079/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013081 if (PyUnicode_READY(self) == -1)
13082 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013083 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084}
13085
Larry Hastings61272b72014-01-07 12:41:53 -080013086/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013087
Larry Hastings31826802013-10-19 00:09:25 -070013088@staticmethod
13089str.maketrans as unicode_maketrans
13090
13091 x: object
13092
13093 y: unicode=NULL
13094
13095 z: unicode=NULL
13096
13097 /
13098
13099Return a translation table usable for str.translate().
13100
13101If there is only one argument, it must be a dictionary mapping Unicode
13102ordinals (integers) or characters to Unicode ordinals, strings or None.
13103Character keys will be then converted to ordinals.
13104If there are two arguments, they must be strings of equal length, and
13105in the resulting dictionary, each character in x will be mapped to the
13106character at the same position in y. If there is a third argument, it
13107must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013108[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013109
Larry Hastings31826802013-10-19 00:09:25 -070013110static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013111unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013112/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013113{
Georg Brandlceee0772007-11-27 23:48:05 +000013114 PyObject *new = NULL, *key, *value;
13115 Py_ssize_t i = 0;
13116 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013117
Georg Brandlceee0772007-11-27 23:48:05 +000013118 new = PyDict_New();
13119 if (!new)
13120 return NULL;
13121 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 int x_kind, y_kind, z_kind;
13123 void *x_data, *y_data, *z_data;
13124
Georg Brandlceee0772007-11-27 23:48:05 +000013125 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013126 if (!PyUnicode_Check(x)) {
13127 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13128 "be a string if there is a second argument");
13129 goto err;
13130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013132 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13133 "arguments must have equal length");
13134 goto err;
13135 }
13136 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 x_kind = PyUnicode_KIND(x);
13138 y_kind = PyUnicode_KIND(y);
13139 x_data = PyUnicode_DATA(x);
13140 y_data = PyUnicode_DATA(y);
13141 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13142 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013143 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013144 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013145 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013146 if (!value) {
13147 Py_DECREF(key);
13148 goto err;
13149 }
Georg Brandlceee0772007-11-27 23:48:05 +000013150 res = PyDict_SetItem(new, key, value);
13151 Py_DECREF(key);
13152 Py_DECREF(value);
13153 if (res < 0)
13154 goto err;
13155 }
13156 /* create entries for deleting chars in z */
13157 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 z_kind = PyUnicode_KIND(z);
13159 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013160 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013162 if (!key)
13163 goto err;
13164 res = PyDict_SetItem(new, key, Py_None);
13165 Py_DECREF(key);
13166 if (res < 0)
13167 goto err;
13168 }
13169 }
13170 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 int kind;
13172 void *data;
13173
Georg Brandlceee0772007-11-27 23:48:05 +000013174 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013175 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013176 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13177 "to maketrans it must be a dict");
13178 goto err;
13179 }
13180 /* copy entries into the new dict, converting string keys to int keys */
13181 while (PyDict_Next(x, &i, &key, &value)) {
13182 if (PyUnicode_Check(key)) {
13183 /* convert string keys to integer keys */
13184 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013185 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013186 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13187 "table must be of length 1");
13188 goto err;
13189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 kind = PyUnicode_KIND(key);
13191 data = PyUnicode_DATA(key);
13192 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013193 if (!newkey)
13194 goto err;
13195 res = PyDict_SetItem(new, newkey, value);
13196 Py_DECREF(newkey);
13197 if (res < 0)
13198 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013199 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013200 /* just keep integer keys */
13201 if (PyDict_SetItem(new, key, value) < 0)
13202 goto err;
13203 } else {
13204 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13205 "be strings or integers");
13206 goto err;
13207 }
13208 }
13209 }
13210 return new;
13211 err:
13212 Py_DECREF(new);
13213 return NULL;
13214}
13215
INADA Naoki3ae20562017-01-16 20:41:20 +090013216/*[clinic input]
13217str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218
INADA Naoki3ae20562017-01-16 20:41:20 +090013219 table: object
13220 Translation table, which must be a mapping of Unicode ordinals to
13221 Unicode ordinals, strings, or None.
13222 /
13223
13224Replace each character in the string using the given translation table.
13225
13226The table must implement lookup/indexing via __getitem__, for instance a
13227dictionary or list. If this operation raises LookupError, the character is
13228left untouched. Characters mapped to None are deleted.
13229[clinic start generated code]*/
13230
13231static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013233/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236}
13237
INADA Naoki3ae20562017-01-16 20:41:20 +090013238/*[clinic input]
13239str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240
INADA Naoki3ae20562017-01-16 20:41:20 +090013241Return a copy of the string converted to uppercase.
13242[clinic start generated code]*/
13243
13244static PyObject *
13245unicode_upper_impl(PyObject *self)
13246/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013248 if (PyUnicode_READY(self) == -1)
13249 return NULL;
13250 if (PyUnicode_IS_ASCII(self))
13251 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013252 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253}
13254
INADA Naoki3ae20562017-01-16 20:41:20 +090013255/*[clinic input]
13256str.zfill as unicode_zfill
13257
13258 width: Py_ssize_t
13259 /
13260
13261Pad a numeric string with zeros on the left, to fill a field of the given width.
13262
13263The string is never truncated.
13264[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265
13266static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013267unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013268/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013270 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013271 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 int kind;
13273 void *data;
13274 Py_UCS4 chr;
13275
Benjamin Petersonbac79492012-01-14 13:34:47 -050013276 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278
Victor Stinnerc4b49542011-12-11 22:44:26 +010013279 if (PyUnicode_GET_LENGTH(self) >= width)
13280 return unicode_result_unchanged(self);
13281
13282 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283
13284 u = pad(self, fill, 0, '0');
13285
Walter Dörwald068325e2002-04-15 13:36:47 +000013286 if (u == NULL)
13287 return NULL;
13288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013289 kind = PyUnicode_KIND(u);
13290 data = PyUnicode_DATA(u);
13291 chr = PyUnicode_READ(kind, data, fill);
13292
13293 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 PyUnicode_WRITE(kind, data, 0, chr);
13296 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297 }
13298
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013299 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013300 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302
#if 0
/* Compiled out.  Would forward `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013311PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013314Return True if S starts with the specified prefix, False otherwise.\n\
13315With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013316With optional end, stop comparing S at that position.\n\
13317prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318
13319static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013320unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013323 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013324 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013325 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013326 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013327 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013328
Jesus Ceaac451502011-04-20 17:09:23 +020013329 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013331 if (PyTuple_Check(subobj)) {
13332 Py_ssize_t i;
13333 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013334 substring = PyTuple_GET_ITEM(subobj, i);
13335 if (!PyUnicode_Check(substring)) {
13336 PyErr_Format(PyExc_TypeError,
13337 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013338 "not %.100s",
13339 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013340 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013341 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013342 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013343 if (result == -1)
13344 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013345 if (result) {
13346 Py_RETURN_TRUE;
13347 }
13348 }
13349 /* nothing matched */
13350 Py_RETURN_FALSE;
13351 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013352 if (!PyUnicode_Check(subobj)) {
13353 PyErr_Format(PyExc_TypeError,
13354 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013355 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013357 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013358 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013359 if (result == -1)
13360 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013361 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362}
13363
13364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013365PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013368Return True if S ends with the specified suffix, False otherwise.\n\
13369With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013370With optional end, stop comparing S at that position.\n\
13371suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
13373static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013374unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013377 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013378 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013379 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013380 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013381 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382
Jesus Ceaac451502011-04-20 17:09:23 +020013383 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013385 if (PyTuple_Check(subobj)) {
13386 Py_ssize_t i;
13387 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013388 substring = PyTuple_GET_ITEM(subobj, i);
13389 if (!PyUnicode_Check(substring)) {
13390 PyErr_Format(PyExc_TypeError,
13391 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013392 "not %.100s",
13393 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013395 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013396 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013397 if (result == -1)
13398 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013399 if (result) {
13400 Py_RETURN_TRUE;
13401 }
13402 }
13403 Py_RETURN_FALSE;
13404 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013405 if (!PyUnicode_Check(subobj)) {
13406 PyErr_Format(PyExc_TypeError,
13407 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013408 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013409 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013410 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013411 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013412 if (result == -1)
13413 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013414 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415}
13416
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013417static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013418_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013419{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013420 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13421 writer->data = PyUnicode_DATA(writer->buffer);
13422
13423 if (!writer->readonly) {
13424 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013425 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013426 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013427 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013428 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13429 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13430 writer->kind = PyUnicode_WCHAR_KIND;
13431 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13432
Victor Stinner8f674cc2013-04-17 23:02:17 +020013433 /* Copy-on-write mode: set buffer size to 0 so
13434 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13435 * next write. */
13436 writer->size = 0;
13437 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013438}
13439
Victor Stinnerd3f08822012-05-29 12:57:52 +020013440void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013441_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013442{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013443 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013444
13445 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013446 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013447
13448 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13449 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13450 writer->kind = PyUnicode_WCHAR_KIND;
13451 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013452}
13453
Victor Stinnerd3f08822012-05-29 12:57:52 +020013454int
13455_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13456 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013457{
13458 Py_ssize_t newlen;
13459 PyObject *newbuffer;
13460
Victor Stinner2740e462016-09-06 16:58:36 -070013461 assert(maxchar <= MAX_UNICODE);
13462
Victor Stinnerca9381e2015-09-22 00:58:32 +020013463 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013464 assert((maxchar > writer->maxchar && length >= 0)
13465 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013466
Victor Stinner202fdca2012-05-07 12:47:02 +020013467 if (length > PY_SSIZE_T_MAX - writer->pos) {
13468 PyErr_NoMemory();
13469 return -1;
13470 }
13471 newlen = writer->pos + length;
13472
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013473 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013474
Victor Stinnerd3f08822012-05-29 12:57:52 +020013475 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013476 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013477 if (writer->overallocate
13478 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13479 /* overallocate to limit the number of realloc() */
13480 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013481 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013482 if (newlen < writer->min_length)
13483 newlen = writer->min_length;
13484
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485 writer->buffer = PyUnicode_New(newlen, maxchar);
13486 if (writer->buffer == NULL)
13487 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013488 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013489 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013490 if (writer->overallocate
13491 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13492 /* overallocate to limit the number of realloc() */
13493 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013495 if (newlen < writer->min_length)
13496 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013497
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013498 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013499 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013500 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013501 newbuffer = PyUnicode_New(newlen, maxchar);
13502 if (newbuffer == NULL)
13503 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13505 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013507 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013508 }
13509 else {
13510 newbuffer = resize_compact(writer->buffer, newlen);
13511 if (newbuffer == NULL)
13512 return -1;
13513 }
13514 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013515 }
13516 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013517 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013518 newbuffer = PyUnicode_New(writer->size, maxchar);
13519 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013520 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013521 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13522 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013523 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013524 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013525 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013526 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013527
13528#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013529}
13530
Victor Stinnerca9381e2015-09-22 00:58:32 +020013531int
13532_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13533 enum PyUnicode_Kind kind)
13534{
13535 Py_UCS4 maxchar;
13536
13537 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13538 assert(writer->kind < kind);
13539
13540 switch (kind)
13541 {
13542 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13543 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13544 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13545 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013546 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013547 }
13548
13549 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13550}
13551
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013552static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013553_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013554{
Victor Stinner2740e462016-09-06 16:58:36 -070013555 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013556 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13557 return -1;
13558 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13559 writer->pos++;
13560 return 0;
13561}
13562
13563int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013564_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13565{
13566 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13567}
13568
13569int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13571{
13572 Py_UCS4 maxchar;
13573 Py_ssize_t len;
13574
13575 if (PyUnicode_READY(str) == -1)
13576 return -1;
13577 len = PyUnicode_GET_LENGTH(str);
13578 if (len == 0)
13579 return 0;
13580 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13581 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013582 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013583 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013584 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013585 Py_INCREF(str);
13586 writer->buffer = str;
13587 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013588 writer->pos += len;
13589 return 0;
13590 }
13591 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13592 return -1;
13593 }
13594 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13595 str, 0, len);
13596 writer->pos += len;
13597 return 0;
13598}
13599
Victor Stinnere215d962012-10-06 23:03:36 +020013600int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013601_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13602 Py_ssize_t start, Py_ssize_t end)
13603{
13604 Py_UCS4 maxchar;
13605 Py_ssize_t len;
13606
13607 if (PyUnicode_READY(str) == -1)
13608 return -1;
13609
13610 assert(0 <= start);
13611 assert(end <= PyUnicode_GET_LENGTH(str));
13612 assert(start <= end);
13613
13614 if (end == 0)
13615 return 0;
13616
13617 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13618 return _PyUnicodeWriter_WriteStr(writer, str);
13619
13620 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13621 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13622 else
13623 maxchar = writer->maxchar;
13624 len = end - start;
13625
13626 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13627 return -1;
13628
13629 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13630 str, start, len);
13631 writer->pos += len;
13632 return 0;
13633}
13634
13635int
Victor Stinner4a587072013-11-19 12:54:53 +010013636_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13637 const char *ascii, Py_ssize_t len)
13638{
13639 if (len == -1)
13640 len = strlen(ascii);
13641
13642 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13643
13644 if (writer->buffer == NULL && !writer->overallocate) {
13645 PyObject *str;
13646
13647 str = _PyUnicode_FromASCII(ascii, len);
13648 if (str == NULL)
13649 return -1;
13650
13651 writer->readonly = 1;
13652 writer->buffer = str;
13653 _PyUnicodeWriter_Update(writer);
13654 writer->pos += len;
13655 return 0;
13656 }
13657
13658 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13659 return -1;
13660
13661 switch (writer->kind)
13662 {
13663 case PyUnicode_1BYTE_KIND:
13664 {
13665 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13666 Py_UCS1 *data = writer->data;
13667
Christian Heimesf051e432016-09-13 20:22:02 +020013668 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013669 break;
13670 }
13671 case PyUnicode_2BYTE_KIND:
13672 {
13673 _PyUnicode_CONVERT_BYTES(
13674 Py_UCS1, Py_UCS2,
13675 ascii, ascii + len,
13676 (Py_UCS2 *)writer->data + writer->pos);
13677 break;
13678 }
13679 case PyUnicode_4BYTE_KIND:
13680 {
13681 _PyUnicode_CONVERT_BYTES(
13682 Py_UCS1, Py_UCS4,
13683 ascii, ascii + len,
13684 (Py_UCS4 *)writer->data + writer->pos);
13685 break;
13686 }
13687 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013688 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013689 }
13690
13691 writer->pos += len;
13692 return 0;
13693}
13694
13695int
13696_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13697 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013698{
13699 Py_UCS4 maxchar;
13700
13701 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13702 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13703 return -1;
13704 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13705 writer->pos += len;
13706 return 0;
13707}
13708
Victor Stinnerd3f08822012-05-29 12:57:52 +020013709PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013710_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013711{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013712 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013713
Victor Stinnerd3f08822012-05-29 12:57:52 +020013714 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013715 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013716 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013717 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013718
13719 str = writer->buffer;
13720 writer->buffer = NULL;
13721
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013722 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013723 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13724 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013725 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013726
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013727 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13728 PyObject *str2;
13729 str2 = resize_compact(str, writer->pos);
13730 if (str2 == NULL) {
13731 Py_DECREF(str);
13732 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013733 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013734 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013735 }
13736
Victor Stinner15a0bd32013-07-08 22:29:55 +020013737 assert(_PyUnicode_CheckConsistency(str, 1));
13738 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013739}
13740
Victor Stinnerd3f08822012-05-29 12:57:52 +020013741void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013742_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013743{
13744 Py_CLEAR(writer->buffer);
13745}
13746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013747#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013748
13749PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013750 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013751\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013752Return a formatted version of S, using substitutions from args and kwargs.\n\
13753The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013754
Eric Smith27bbca62010-11-04 17:06:58 +000013755PyDoc_STRVAR(format_map__doc__,
13756 "S.format_map(mapping) -> str\n\
13757\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013758Return a formatted version of S, using substitutions from mapping.\n\
13759The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013760
INADA Naoki3ae20562017-01-16 20:41:20 +090013761/*[clinic input]
13762str.__format__ as unicode___format__
13763
13764 format_spec: unicode
13765 /
13766
13767Return a formatted version of the string as described by format_spec.
13768[clinic start generated code]*/
13769
Eric Smith4a7d76d2008-05-30 18:10:19 +000013770static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013771unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013772/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013773{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013774 _PyUnicodeWriter writer;
13775 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013776
Victor Stinnerd3f08822012-05-29 12:57:52 +020013777 if (PyUnicode_READY(self) == -1)
13778 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013779 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013780 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13781 self, format_spec, 0,
13782 PyUnicode_GET_LENGTH(format_spec));
13783 if (ret == -1) {
13784 _PyUnicodeWriter_Dealloc(&writer);
13785 return NULL;
13786 }
13787 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013788}
13789
INADA Naoki3ae20562017-01-16 20:41:20 +090013790/*[clinic input]
13791str.__sizeof__ as unicode_sizeof
13792
13793Return the size of the string in memory, in bytes.
13794[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013795
13796static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013797unicode_sizeof_impl(PyObject *self)
13798/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013800 Py_ssize_t size;
13801
13802 /* If it's a compact object, account for base structure +
13803 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013804 if (PyUnicode_IS_COMPACT_ASCII(self))
13805 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13806 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013807 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013808 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013809 else {
13810 /* If it is a two-block object, account for base object, and
13811 for character block if present. */
13812 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013813 if (_PyUnicode_DATA_ANY(self))
13814 size += (PyUnicode_GET_LENGTH(self) + 1) *
13815 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013816 }
13817 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013818 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013819 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13820 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13821 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13822 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013823
13824 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013825}
13826
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013827static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013828unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013829{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013830 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013831 if (!copy)
13832 return NULL;
13833 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013834}
13835
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013837 UNICODE_ENCODE_METHODDEF
13838 UNICODE_REPLACE_METHODDEF
13839 UNICODE_SPLIT_METHODDEF
13840 UNICODE_RSPLIT_METHODDEF
13841 UNICODE_JOIN_METHODDEF
13842 UNICODE_CAPITALIZE_METHODDEF
13843 UNICODE_CASEFOLD_METHODDEF
13844 UNICODE_TITLE_METHODDEF
13845 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013846 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013847 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013848 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013849 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013850 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013851 UNICODE_LJUST_METHODDEF
13852 UNICODE_LOWER_METHODDEF
13853 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013854 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13855 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013856 UNICODE_RJUST_METHODDEF
13857 UNICODE_RSTRIP_METHODDEF
13858 UNICODE_RPARTITION_METHODDEF
13859 UNICODE_SPLITLINES_METHODDEF
13860 UNICODE_STRIP_METHODDEF
13861 UNICODE_SWAPCASE_METHODDEF
13862 UNICODE_TRANSLATE_METHODDEF
13863 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013864 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13865 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013866 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013867 UNICODE_ISLOWER_METHODDEF
13868 UNICODE_ISUPPER_METHODDEF
13869 UNICODE_ISTITLE_METHODDEF
13870 UNICODE_ISSPACE_METHODDEF
13871 UNICODE_ISDECIMAL_METHODDEF
13872 UNICODE_ISDIGIT_METHODDEF
13873 UNICODE_ISNUMERIC_METHODDEF
13874 UNICODE_ISALPHA_METHODDEF
13875 UNICODE_ISALNUM_METHODDEF
13876 UNICODE_ISIDENTIFIER_METHODDEF
13877 UNICODE_ISPRINTABLE_METHODDEF
13878 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013879 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013880 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013881 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013882 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013883 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013884#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013885 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013886 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013887#endif
13888
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013889 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013890 {NULL, NULL}
13891};
13892
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013893static PyObject *
13894unicode_mod(PyObject *v, PyObject *w)
13895{
Brian Curtindfc80e32011-08-10 20:28:54 -050013896 if (!PyUnicode_Check(v))
13897 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013899}
13900
13901static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013902 0, /*nb_add*/
13903 0, /*nb_subtract*/
13904 0, /*nb_multiply*/
13905 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013906};
13907
Guido van Rossumd57fd912000-03-10 22:53:23 +000013908static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013909 (lenfunc) unicode_length, /* sq_length */
13910 PyUnicode_Concat, /* sq_concat */
13911 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13912 (ssizeargfunc) unicode_getitem, /* sq_item */
13913 0, /* sq_slice */
13914 0, /* sq_ass_item */
13915 0, /* sq_ass_slice */
13916 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013917};
13918
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013919static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013920unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013922 if (PyUnicode_READY(self) == -1)
13923 return NULL;
13924
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013925 if (PyIndex_Check(item)) {
13926 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013927 if (i == -1 && PyErr_Occurred())
13928 return NULL;
13929 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013930 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013931 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013932 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013933 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013934 PyObject *result;
13935 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013936 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013937 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013938
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013939 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013940 return NULL;
13941 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013942 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13943 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013944
13945 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013946 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013947 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013948 slicelength == PyUnicode_GET_LENGTH(self)) {
13949 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013950 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013951 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013952 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013953 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013954 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013955 src_kind = PyUnicode_KIND(self);
13956 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013957 if (!PyUnicode_IS_ASCII(self)) {
13958 kind_limit = kind_maxchar_limit(src_kind);
13959 max_char = 0;
13960 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13961 ch = PyUnicode_READ(src_kind, src_data, cur);
13962 if (ch > max_char) {
13963 max_char = ch;
13964 if (max_char >= kind_limit)
13965 break;
13966 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013967 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013968 }
Victor Stinner55c99112011-10-13 01:17:06 +020013969 else
13970 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013971 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013972 if (result == NULL)
13973 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013974 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013975 dest_data = PyUnicode_DATA(result);
13976
13977 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013978 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13979 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013980 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013981 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013982 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013983 } else {
13984 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13985 return NULL;
13986 }
13987}
13988
13989static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 (lenfunc)unicode_length, /* mp_length */
13991 (binaryfunc)unicode_subscript, /* mp_subscript */
13992 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013993};
13994
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995
Guido van Rossumd57fd912000-03-10 22:53:23 +000013996/* Helpers for PyUnicode_Format() */
13997
Victor Stinnera47082312012-10-04 02:19:54 +020013998struct unicode_formatter_t {
13999 PyObject *args;
14000 int args_owned;
14001 Py_ssize_t arglen, argidx;
14002 PyObject *dict;
14003
14004 enum PyUnicode_Kind fmtkind;
14005 Py_ssize_t fmtcnt, fmtpos;
14006 void *fmtdata;
14007 PyObject *fmtstr;
14008
14009 _PyUnicodeWriter writer;
14010};
14011
14012struct unicode_format_arg_t {
14013 Py_UCS4 ch;
14014 int flags;
14015 Py_ssize_t width;
14016 int prec;
14017 int sign;
14018};
14019
Guido van Rossumd57fd912000-03-10 22:53:23 +000014020static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014021unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014022{
Victor Stinnera47082312012-10-04 02:19:54 +020014023 Py_ssize_t argidx = ctx->argidx;
14024
14025 if (argidx < ctx->arglen) {
14026 ctx->argidx++;
14027 if (ctx->arglen < 0)
14028 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014029 else
Victor Stinnera47082312012-10-04 02:19:54 +020014030 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014031 }
14032 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014033 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014034 return NULL;
14035}
14036
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014037/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014038
Victor Stinnera47082312012-10-04 02:19:54 +020014039/* Format a float into the writer if the writer is not NULL, or into *p_output
14040 otherwise.
14041
14042 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014043static int
Victor Stinnera47082312012-10-04 02:19:54 +020014044formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14045 PyObject **p_output,
14046 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014047{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014048 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014049 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014050 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014051 int prec;
14052 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014053
Guido van Rossumd57fd912000-03-10 22:53:23 +000014054 x = PyFloat_AsDouble(v);
14055 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014056 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014057
Victor Stinnera47082312012-10-04 02:19:54 +020014058 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014060 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014061
Victor Stinnera47082312012-10-04 02:19:54 +020014062 if (arg->flags & F_ALT)
14063 dtoa_flags = Py_DTSF_ALT;
14064 else
14065 dtoa_flags = 0;
14066 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014067 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014068 return -1;
14069 len = strlen(p);
14070 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014071 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014072 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014073 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014074 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014075 }
14076 else
14077 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014078 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014079 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014080}
14081
Victor Stinnerd0880d52012-04-27 23:40:13 +020014082/* formatlong() emulates the format codes d, u, o, x and X, and
14083 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14084 * Python's regular ints.
14085 * Return value: a new PyUnicodeObject*, or NULL if error.
14086 * The output string is of the form
14087 * "-"? ("0x" | "0X")? digit+
14088 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14089 * set in flags. The case of hex digits will be correct,
14090 * There will be at least prec digits, zero-filled on the left if
14091 * necessary to get that many.
14092 * val object to be converted
14093 * flags bitmask of format flags; only F_ALT is looked at
14094 * prec minimum number of digits; 0-fill on left if needed
14095 * type a character in [duoxX]; u acts the same as d
14096 *
14097 * CAUTION: o, x and X conversions on regular ints can never
14098 * produce a '-' sign, but can for Python's unbounded ints.
14099 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014100PyObject *
14101_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014102{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014103 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014105 Py_ssize_t i;
14106 int sign; /* 1 if '-', else 0 */
14107 int len; /* number of characters */
14108 Py_ssize_t llen;
14109 int numdigits; /* len == numnondigits + numdigits */
14110 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014111
Victor Stinnerd0880d52012-04-27 23:40:13 +020014112 /* Avoid exceeding SSIZE_T_MAX */
14113 if (prec > INT_MAX-3) {
14114 PyErr_SetString(PyExc_OverflowError,
14115 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014116 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014117 }
14118
14119 assert(PyLong_Check(val));
14120
14121 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014122 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014123 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014124 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014125 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014126 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014127 /* int and int subclasses should print numerically when a numeric */
14128 /* format code is used (see issue18780) */
14129 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014130 break;
14131 case 'o':
14132 numnondigits = 2;
14133 result = PyNumber_ToBase(val, 8);
14134 break;
14135 case 'x':
14136 case 'X':
14137 numnondigits = 2;
14138 result = PyNumber_ToBase(val, 16);
14139 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014140 }
14141 if (!result)
14142 return NULL;
14143
14144 assert(unicode_modifiable(result));
14145 assert(PyUnicode_IS_READY(result));
14146 assert(PyUnicode_IS_ASCII(result));
14147
14148 /* To modify the string in-place, there can only be one reference. */
14149 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014150 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014151 PyErr_BadInternalCall();
14152 return NULL;
14153 }
14154 buf = PyUnicode_DATA(result);
14155 llen = PyUnicode_GET_LENGTH(result);
14156 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014157 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014158 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014159 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014160 return NULL;
14161 }
14162 len = (int)llen;
14163 sign = buf[0] == '-';
14164 numnondigits += sign;
14165 numdigits = len - numnondigits;
14166 assert(numdigits > 0);
14167
14168 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014169 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014170 (type == 'o' || type == 'x' || type == 'X'))) {
14171 assert(buf[sign] == '0');
14172 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14173 buf[sign+1] == 'o');
14174 numnondigits -= 2;
14175 buf += 2;
14176 len -= 2;
14177 if (sign)
14178 buf[0] = '-';
14179 assert(len == numnondigits + numdigits);
14180 assert(numdigits > 0);
14181 }
14182
14183 /* Fill with leading zeroes to meet minimum width. */
14184 if (prec > numdigits) {
14185 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14186 numnondigits + prec);
14187 char *b1;
14188 if (!r1) {
14189 Py_DECREF(result);
14190 return NULL;
14191 }
14192 b1 = PyBytes_AS_STRING(r1);
14193 for (i = 0; i < numnondigits; ++i)
14194 *b1++ = *buf++;
14195 for (i = 0; i < prec - numdigits; i++)
14196 *b1++ = '0';
14197 for (i = 0; i < numdigits; i++)
14198 *b1++ = *buf++;
14199 *b1 = '\0';
14200 Py_DECREF(result);
14201 result = r1;
14202 buf = PyBytes_AS_STRING(result);
14203 len = numnondigits + prec;
14204 }
14205
14206 /* Fix up case for hex conversions. */
14207 if (type == 'X') {
14208 /* Need to convert all lower case letters to upper case.
14209 and need to convert 0x to 0X (and -0x to -0X). */
14210 for (i = 0; i < len; i++)
14211 if (buf[i] >= 'a' && buf[i] <= 'x')
14212 buf[i] -= 'a'-'A';
14213 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014214 if (!PyUnicode_Check(result)
14215 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014216 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014217 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014218 Py_DECREF(result);
14219 result = unicode;
14220 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014221 else if (len != PyUnicode_GET_LENGTH(result)) {
14222 if (PyUnicode_Resize(&result, len) < 0)
14223 Py_CLEAR(result);
14224 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014225 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014226}
14227
Ethan Furmandf3ed242014-01-05 06:50:30 -080014228/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014229 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014230 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014231 * -1 and raise an exception on error */
14232static int
Victor Stinnera47082312012-10-04 02:19:54 +020014233mainformatlong(PyObject *v,
14234 struct unicode_format_arg_t *arg,
14235 PyObject **p_output,
14236 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014237{
14238 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014239 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014240
14241 if (!PyNumber_Check(v))
14242 goto wrongtype;
14243
Ethan Furman9ab74802014-03-21 06:38:46 -070014244 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014245 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014246 if (type == 'o' || type == 'x' || type == 'X') {
14247 iobj = PyNumber_Index(v);
14248 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014249 if (PyErr_ExceptionMatches(PyExc_TypeError))
14250 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014251 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014252 }
14253 }
14254 else {
14255 iobj = PyNumber_Long(v);
14256 if (iobj == NULL ) {
14257 if (PyErr_ExceptionMatches(PyExc_TypeError))
14258 goto wrongtype;
14259 return -1;
14260 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014261 }
14262 assert(PyLong_Check(iobj));
14263 }
14264 else {
14265 iobj = v;
14266 Py_INCREF(iobj);
14267 }
14268
14269 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014270 && arg->width == -1 && arg->prec == -1
14271 && !(arg->flags & (F_SIGN | F_BLANK))
14272 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014273 {
14274 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014275 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014276 int base;
14277
Victor Stinnera47082312012-10-04 02:19:54 +020014278 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014279 {
14280 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014281 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014282 case 'd':
14283 case 'i':
14284 case 'u':
14285 base = 10;
14286 break;
14287 case 'o':
14288 base = 8;
14289 break;
14290 case 'x':
14291 case 'X':
14292 base = 16;
14293 break;
14294 }
14295
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014296 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14297 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014298 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014299 }
14300 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301 return 1;
14302 }
14303
Ethan Furmanb95b5612015-01-23 20:05:18 -080014304 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014305 Py_DECREF(iobj);
14306 if (res == NULL)
14307 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014308 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014309 return 0;
14310
14311wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014312 switch(type)
14313 {
14314 case 'o':
14315 case 'x':
14316 case 'X':
14317 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014318 "%%%c format: an integer is required, "
14319 "not %.200s",
14320 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014321 break;
14322 default:
14323 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014324 "%%%c format: a number is required, "
14325 "not %.200s",
14326 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014327 break;
14328 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014329 return -1;
14330}
14331
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014332static Py_UCS4
14333formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014334{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014335 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014336 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014337 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014338 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014339 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014340 goto onError;
14341 }
14342 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014343 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014344 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014345 /* make sure number is a type of integer */
14346 if (!PyLong_Check(v)) {
14347 iobj = PyNumber_Index(v);
14348 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014349 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014350 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014351 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014352 Py_DECREF(iobj);
14353 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014354 else {
14355 x = PyLong_AsLong(v);
14356 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014357 if (x == -1 && PyErr_Occurred())
14358 goto onError;
14359
Victor Stinner8faf8212011-12-08 22:14:11 +010014360 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014361 PyErr_SetString(PyExc_OverflowError,
14362 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014363 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014364 }
14365
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014366 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014367 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014368
Benjamin Peterson29060642009-01-31 22:14:21 +000014369 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014370 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014371 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014372 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014373}
14374
Victor Stinnera47082312012-10-04 02:19:54 +020014375/* Parse options of an argument: flags, width, precision.
14376 Handle also "%(name)" syntax.
14377
14378 Return 0 if the argument has been formatted into arg->str.
14379 Return 1 if the argument has been written into ctx->writer,
14380 Raise an exception and return -1 on error. */
14381static int
14382unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14383 struct unicode_format_arg_t *arg)
14384{
14385#define FORMAT_READ(ctx) \
14386 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14387
14388 PyObject *v;
14389
Victor Stinnera47082312012-10-04 02:19:54 +020014390 if (arg->ch == '(') {
14391 /* Get argument value from a dictionary. Example: "%(name)s". */
14392 Py_ssize_t keystart;
14393 Py_ssize_t keylen;
14394 PyObject *key;
14395 int pcount = 1;
14396
14397 if (ctx->dict == NULL) {
14398 PyErr_SetString(PyExc_TypeError,
14399 "format requires a mapping");
14400 return -1;
14401 }
14402 ++ctx->fmtpos;
14403 --ctx->fmtcnt;
14404 keystart = ctx->fmtpos;
14405 /* Skip over balanced parentheses */
14406 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14407 arg->ch = FORMAT_READ(ctx);
14408 if (arg->ch == ')')
14409 --pcount;
14410 else if (arg->ch == '(')
14411 ++pcount;
14412 ctx->fmtpos++;
14413 }
14414 keylen = ctx->fmtpos - keystart - 1;
14415 if (ctx->fmtcnt < 0 || pcount > 0) {
14416 PyErr_SetString(PyExc_ValueError,
14417 "incomplete format key");
14418 return -1;
14419 }
14420 key = PyUnicode_Substring(ctx->fmtstr,
14421 keystart, keystart + keylen);
14422 if (key == NULL)
14423 return -1;
14424 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014425 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014426 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014427 }
14428 ctx->args = PyObject_GetItem(ctx->dict, key);
14429 Py_DECREF(key);
14430 if (ctx->args == NULL)
14431 return -1;
14432 ctx->args_owned = 1;
14433 ctx->arglen = -1;
14434 ctx->argidx = -2;
14435 }
14436
14437 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014438 while (--ctx->fmtcnt >= 0) {
14439 arg->ch = FORMAT_READ(ctx);
14440 ctx->fmtpos++;
14441 switch (arg->ch) {
14442 case '-': arg->flags |= F_LJUST; continue;
14443 case '+': arg->flags |= F_SIGN; continue;
14444 case ' ': arg->flags |= F_BLANK; continue;
14445 case '#': arg->flags |= F_ALT; continue;
14446 case '0': arg->flags |= F_ZERO; continue;
14447 }
14448 break;
14449 }
14450
14451 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014452 if (arg->ch == '*') {
14453 v = unicode_format_getnextarg(ctx);
14454 if (v == NULL)
14455 return -1;
14456 if (!PyLong_Check(v)) {
14457 PyErr_SetString(PyExc_TypeError,
14458 "* wants int");
14459 return -1;
14460 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014461 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014462 if (arg->width == -1 && PyErr_Occurred())
14463 return -1;
14464 if (arg->width < 0) {
14465 arg->flags |= F_LJUST;
14466 arg->width = -arg->width;
14467 }
14468 if (--ctx->fmtcnt >= 0) {
14469 arg->ch = FORMAT_READ(ctx);
14470 ctx->fmtpos++;
14471 }
14472 }
14473 else if (arg->ch >= '0' && arg->ch <= '9') {
14474 arg->width = arg->ch - '0';
14475 while (--ctx->fmtcnt >= 0) {
14476 arg->ch = FORMAT_READ(ctx);
14477 ctx->fmtpos++;
14478 if (arg->ch < '0' || arg->ch > '9')
14479 break;
14480 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14481 mixing signed and unsigned comparison. Since arg->ch is between
14482 '0' and '9', casting to int is safe. */
14483 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14484 PyErr_SetString(PyExc_ValueError,
14485 "width too big");
14486 return -1;
14487 }
14488 arg->width = arg->width*10 + (arg->ch - '0');
14489 }
14490 }
14491
14492 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014493 if (arg->ch == '.') {
14494 arg->prec = 0;
14495 if (--ctx->fmtcnt >= 0) {
14496 arg->ch = FORMAT_READ(ctx);
14497 ctx->fmtpos++;
14498 }
14499 if (arg->ch == '*') {
14500 v = unicode_format_getnextarg(ctx);
14501 if (v == NULL)
14502 return -1;
14503 if (!PyLong_Check(v)) {
14504 PyErr_SetString(PyExc_TypeError,
14505 "* wants int");
14506 return -1;
14507 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014508 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014509 if (arg->prec == -1 && PyErr_Occurred())
14510 return -1;
14511 if (arg->prec < 0)
14512 arg->prec = 0;
14513 if (--ctx->fmtcnt >= 0) {
14514 arg->ch = FORMAT_READ(ctx);
14515 ctx->fmtpos++;
14516 }
14517 }
14518 else if (arg->ch >= '0' && arg->ch <= '9') {
14519 arg->prec = arg->ch - '0';
14520 while (--ctx->fmtcnt >= 0) {
14521 arg->ch = FORMAT_READ(ctx);
14522 ctx->fmtpos++;
14523 if (arg->ch < '0' || arg->ch > '9')
14524 break;
14525 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14526 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014527 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014528 return -1;
14529 }
14530 arg->prec = arg->prec*10 + (arg->ch - '0');
14531 }
14532 }
14533 }
14534
14535 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14536 if (ctx->fmtcnt >= 0) {
14537 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14538 if (--ctx->fmtcnt >= 0) {
14539 arg->ch = FORMAT_READ(ctx);
14540 ctx->fmtpos++;
14541 }
14542 }
14543 }
14544 if (ctx->fmtcnt < 0) {
14545 PyErr_SetString(PyExc_ValueError,
14546 "incomplete format");
14547 return -1;
14548 }
14549 return 0;
14550
14551#undef FORMAT_READ
14552}
14553
14554/* Format one argument. Supported conversion specifiers:
14555
14556 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014557 - "i", "d", "u": int or float
14558 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014559 - "e", "E", "f", "F", "g", "G": float
14560 - "c": int or str (1 character)
14561
Victor Stinner8dbd4212012-12-04 09:30:24 +010014562 When possible, the output is written directly into the Unicode writer
14563 (ctx->writer). A string is created when padding is required.
14564
Victor Stinnera47082312012-10-04 02:19:54 +020014565 Return 0 if the argument has been formatted into *p_str,
14566 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014567 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014568static int
14569unicode_format_arg_format(struct unicode_formatter_t *ctx,
14570 struct unicode_format_arg_t *arg,
14571 PyObject **p_str)
14572{
14573 PyObject *v;
14574 _PyUnicodeWriter *writer = &ctx->writer;
14575
14576 if (ctx->fmtcnt == 0)
14577 ctx->writer.overallocate = 0;
14578
Victor Stinnera47082312012-10-04 02:19:54 +020014579 v = unicode_format_getnextarg(ctx);
14580 if (v == NULL)
14581 return -1;
14582
Victor Stinnera47082312012-10-04 02:19:54 +020014583
14584 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014585 case 's':
14586 case 'r':
14587 case 'a':
14588 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14589 /* Fast path */
14590 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14591 return -1;
14592 return 1;
14593 }
14594
14595 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14596 *p_str = v;
14597 Py_INCREF(*p_str);
14598 }
14599 else {
14600 if (arg->ch == 's')
14601 *p_str = PyObject_Str(v);
14602 else if (arg->ch == 'r')
14603 *p_str = PyObject_Repr(v);
14604 else
14605 *p_str = PyObject_ASCII(v);
14606 }
14607 break;
14608
14609 case 'i':
14610 case 'd':
14611 case 'u':
14612 case 'o':
14613 case 'x':
14614 case 'X':
14615 {
14616 int ret = mainformatlong(v, arg, p_str, writer);
14617 if (ret != 0)
14618 return ret;
14619 arg->sign = 1;
14620 break;
14621 }
14622
14623 case 'e':
14624 case 'E':
14625 case 'f':
14626 case 'F':
14627 case 'g':
14628 case 'G':
14629 if (arg->width == -1 && arg->prec == -1
14630 && !(arg->flags & (F_SIGN | F_BLANK)))
14631 {
14632 /* Fast path */
14633 if (formatfloat(v, arg, NULL, writer) == -1)
14634 return -1;
14635 return 1;
14636 }
14637
14638 arg->sign = 1;
14639 if (formatfloat(v, arg, p_str, NULL) == -1)
14640 return -1;
14641 break;
14642
14643 case 'c':
14644 {
14645 Py_UCS4 ch = formatchar(v);
14646 if (ch == (Py_UCS4) -1)
14647 return -1;
14648 if (arg->width == -1 && arg->prec == -1) {
14649 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014650 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014651 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014652 return 1;
14653 }
14654 *p_str = PyUnicode_FromOrdinal(ch);
14655 break;
14656 }
14657
14658 default:
14659 PyErr_Format(PyExc_ValueError,
14660 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014661 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014662 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14663 (int)arg->ch,
14664 ctx->fmtpos - 1);
14665 return -1;
14666 }
14667 if (*p_str == NULL)
14668 return -1;
14669 assert (PyUnicode_Check(*p_str));
14670 return 0;
14671}
14672
/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and alternate-form options stored in arg.

   arg->width and arg->sign are used as scratch values and are modified.
   The reference to str is not released here.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set and the "0" flag
       was given; otherwise pad with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    /* No padding, no truncation and no forced sign: copy str as is */
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is emitted separately: take it out of the digits */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* no sign character to write at all */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only matters if left padding will be written */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        /* Scan only the part of str that may be copied */
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* Reserve one more character for the sign when the digits alone
       already fill the whole width */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           spaces it is written after the padding, further below */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the digits in both cases */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* prevents the right-padding step below from padding again */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (right padding always uses spaces, whatever the fill character is) */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14827
14828/* Helper of PyUnicode_Format(): format one arg.
14829 Return 0 on success, raise an exception and return -1 on error. */
14830static int
14831unicode_format_arg(struct unicode_formatter_t *ctx)
14832{
14833 struct unicode_format_arg_t arg;
14834 PyObject *str;
14835 int ret;
14836
Victor Stinner8dbd4212012-12-04 09:30:24 +010014837 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014838 if (arg.ch == '%') {
14839 ctx->fmtpos++;
14840 ctx->fmtcnt--;
14841 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14842 return -1;
14843 return 0;
14844 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014845 arg.flags = 0;
14846 arg.width = -1;
14847 arg.prec = -1;
14848 arg.sign = 0;
14849 str = NULL;
14850
Victor Stinnera47082312012-10-04 02:19:54 +020014851 ret = unicode_format_arg_parse(ctx, &arg);
14852 if (ret == -1)
14853 return -1;
14854
14855 ret = unicode_format_arg_format(ctx, &arg, &str);
14856 if (ret == -1)
14857 return -1;
14858
14859 if (ret != 1) {
14860 ret = unicode_format_arg_output(ctx, &arg, str);
14861 Py_DECREF(str);
14862 if (ret == -1)
14863 return -1;
14864 }
14865
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014866 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014867 PyErr_SetString(PyExc_TypeError,
14868 "not all arguments converted during string formatting");
14869 return -1;
14870 }
14871 return 0;
14872}
14873
Alexander Belopolsky40018472011-02-26 01:02:56 +000014874PyObject *
14875PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014876{
Victor Stinnera47082312012-10-04 02:19:54 +020014877 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014878
Guido van Rossumd57fd912000-03-10 22:53:23 +000014879 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014880 PyErr_BadInternalCall();
14881 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014882 }
Victor Stinnera47082312012-10-04 02:19:54 +020014883
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014884 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014885 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014886
14887 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014888 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14889 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14890 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14891 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014892
Victor Stinner8f674cc2013-04-17 23:02:17 +020014893 _PyUnicodeWriter_Init(&ctx.writer);
14894 ctx.writer.min_length = ctx.fmtcnt + 100;
14895 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014896
Guido van Rossumd57fd912000-03-10 22:53:23 +000014897 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014898 ctx.arglen = PyTuple_Size(args);
14899 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014900 }
14901 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014902 ctx.arglen = -1;
14903 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014904 }
Victor Stinnera47082312012-10-04 02:19:54 +020014905 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014906 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014907 ctx.dict = args;
14908 else
14909 ctx.dict = NULL;
14910 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014911
Victor Stinnera47082312012-10-04 02:19:54 +020014912 while (--ctx.fmtcnt >= 0) {
14913 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014914 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014915
14916 nonfmtpos = ctx.fmtpos++;
14917 while (ctx.fmtcnt >= 0 &&
14918 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14919 ctx.fmtpos++;
14920 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014921 }
Victor Stinnera47082312012-10-04 02:19:54 +020014922 if (ctx.fmtcnt < 0) {
14923 ctx.fmtpos--;
14924 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014925 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014926
Victor Stinnercfc4c132013-04-03 01:48:39 +020014927 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14928 nonfmtpos, ctx.fmtpos) < 0)
14929 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014930 }
14931 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014932 ctx.fmtpos++;
14933 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014934 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014935 }
14936 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014937
Victor Stinnera47082312012-10-04 02:19:54 +020014938 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014939 PyErr_SetString(PyExc_TypeError,
14940 "not all arguments converted during string formatting");
14941 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014942 }
14943
Victor Stinnera47082312012-10-04 02:19:54 +020014944 if (ctx.args_owned) {
14945 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946 }
Victor Stinnera47082312012-10-04 02:19:54 +020014947 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014948
Benjamin Peterson29060642009-01-31 22:14:21 +000014949 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014950 _PyUnicodeWriter_Dealloc(&ctx.writer);
14951 if (ctx.args_owned) {
14952 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953 }
14954 return NULL;
14955}
14956
Jeremy Hylton938ace62002-07-17 16:30:39 +000014957static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014958unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14959
Tim Peters6d6c1a32001-08-02 04:15:00 +000014960static PyObject *
14961unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14962{
Benjamin Peterson29060642009-01-31 22:14:21 +000014963 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014964 static char *kwlist[] = {"object", "encoding", "errors", 0};
14965 char *encoding = NULL;
14966 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014967
Benjamin Peterson14339b62009-01-31 16:36:08 +000014968 if (type != &PyUnicode_Type)
14969 return unicode_subtype_new(type, args, kwds);
14970 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014971 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 return NULL;
14973 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014974 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014975 if (encoding == NULL && errors == NULL)
14976 return PyObject_Str(x);
14977 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014978 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014979}
14980
Guido van Rossume023fe02001-08-30 03:12:59 +000014981static PyObject *
14982unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14983{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014984 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014985 Py_ssize_t length, char_size;
14986 int share_wstr, share_utf8;
14987 unsigned int kind;
14988 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014989
Benjamin Peterson14339b62009-01-31 16:36:08 +000014990 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014991
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014992 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014993 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014994 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014995 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014996 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014997 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014998 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014999 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015000
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015001 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015002 if (self == NULL) {
15003 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015004 return NULL;
15005 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015006 kind = PyUnicode_KIND(unicode);
15007 length = PyUnicode_GET_LENGTH(unicode);
15008
15009 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015010#ifdef Py_DEBUG
15011 _PyUnicode_HASH(self) = -1;
15012#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015013 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015014#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015015 _PyUnicode_STATE(self).interned = 0;
15016 _PyUnicode_STATE(self).kind = kind;
15017 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015018 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015019 _PyUnicode_STATE(self).ready = 1;
15020 _PyUnicode_WSTR(self) = NULL;
15021 _PyUnicode_UTF8_LENGTH(self) = 0;
15022 _PyUnicode_UTF8(self) = NULL;
15023 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015024 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015025
15026 share_utf8 = 0;
15027 share_wstr = 0;
15028 if (kind == PyUnicode_1BYTE_KIND) {
15029 char_size = 1;
15030 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15031 share_utf8 = 1;
15032 }
15033 else if (kind == PyUnicode_2BYTE_KIND) {
15034 char_size = 2;
15035 if (sizeof(wchar_t) == 2)
15036 share_wstr = 1;
15037 }
15038 else {
15039 assert(kind == PyUnicode_4BYTE_KIND);
15040 char_size = 4;
15041 if (sizeof(wchar_t) == 4)
15042 share_wstr = 1;
15043 }
15044
15045 /* Ensure we won't overflow the length. */
15046 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15047 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015048 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015050 data = PyObject_MALLOC((length + 1) * char_size);
15051 if (data == NULL) {
15052 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015053 goto onError;
15054 }
15055
Victor Stinnerc3c74152011-10-02 20:39:55 +020015056 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015057 if (share_utf8) {
15058 _PyUnicode_UTF8_LENGTH(self) = length;
15059 _PyUnicode_UTF8(self) = data;
15060 }
15061 if (share_wstr) {
15062 _PyUnicode_WSTR_LENGTH(self) = length;
15063 _PyUnicode_WSTR(self) = (wchar_t *)data;
15064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015065
Christian Heimesf051e432016-09-13 20:22:02 +020015066 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015067 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015068 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015069#ifdef Py_DEBUG
15070 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15071#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015072 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015073 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015074
15075onError:
15076 Py_DECREF(unicode);
15077 Py_DECREF(self);
15078 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015079}
15080
/* Documentation string of the str type (used as tp_doc of PyUnicode_Type).
   The continuation lines must start in column 0: any leading whitespace
   would become part of the string. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015092
/* Forward declaration: needed by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of "str".  The initializer is positional: every entry must
   stay in slot order, and a slot left at 0 is not set here.  The type can
   be subclassed (Py_TPFLAGS_BASETYPE); instances are created through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15138
15139/* Initialize the Unicode implementation */
15140
Victor Stinner3a50e702011-10-18 21:21:00 +020015141int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015142{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015143 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015144 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015145 0x000A, /* LINE FEED */
15146 0x000D, /* CARRIAGE RETURN */
15147 0x001C, /* FILE SEPARATOR */
15148 0x001D, /* GROUP SEPARATOR */
15149 0x001E, /* RECORD SEPARATOR */
15150 0x0085, /* NEXT LINE */
15151 0x2028, /* LINE SEPARATOR */
15152 0x2029, /* PARAGRAPH SEPARATOR */
15153 };
15154
Fred Drakee4315f52000-05-09 19:53:39 +000015155 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015156 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015157 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015158 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015159 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015160
Guido van Rossumcacfc072002-05-24 19:01:59 +000015161 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015162 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015163
15164 /* initialize the linebreak bloom filter */
15165 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015166 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015167 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015168
Christian Heimes26532f72013-07-20 14:57:16 +020015169 if (PyType_Ready(&EncodingMapType) < 0)
15170 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015171
Benjamin Petersonc4311282012-10-30 23:21:10 -040015172 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15173 Py_FatalError("Can't initialize field name iterator type");
15174
15175 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15176 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015177
Victor Stinner3a50e702011-10-18 21:21:00 +020015178 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015179}
15180
15181/* Finalize the Unicode implementation */
15182
/* Free-list hook for the Unicode type.  The body does no work and always
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15188
Guido van Rossumd57fd912000-03-10 22:53:23 +000015189void
Thomas Wouters78890102000-07-22 19:25:51 +000015190_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015191{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015192 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015193
Serhiy Storchaka05997252013-01-26 12:14:02 +020015194 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015195
Serhiy Storchaka05997252013-01-26 12:14:02 +020015196 for (i = 0; i < 256; i++)
15197 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015198 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015199 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015200}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015201
Walter Dörwald16807132007-05-25 13:52:07 +000015202void
15203PyUnicode_InternInPlace(PyObject **p)
15204{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015205 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015206 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015207#ifdef Py_DEBUG
15208 assert(s != NULL);
15209 assert(_PyUnicode_CHECK(s));
15210#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015212 return;
15213#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 /* If it's a subclass, we don't really know what putting
15215 it in the interned dict might do. */
15216 if (!PyUnicode_CheckExact(s))
15217 return;
15218 if (PyUnicode_CHECK_INTERNED(s))
15219 return;
15220 if (interned == NULL) {
15221 interned = PyDict_New();
15222 if (interned == NULL) {
15223 PyErr_Clear(); /* Don't leave an exception */
15224 return;
15225 }
15226 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015228 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015230 if (t == NULL) {
15231 PyErr_Clear();
15232 return;
15233 }
15234 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015235 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015236 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015237 return;
15238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 /* The two references in interned are not counted by refcnt.
15240 The deallocator will take care of this */
15241 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015242 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015243}
15244
15245void
15246PyUnicode_InternImmortal(PyObject **p)
15247{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 PyUnicode_InternInPlace(p);
15249 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015250 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015251 Py_INCREF(*p);
15252 }
Walter Dörwald16807132007-05-25 13:52:07 +000015253}
15254
15255PyObject *
15256PyUnicode_InternFromString(const char *cp)
15257{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015258 PyObject *s = PyUnicode_FromString(cp);
15259 if (s == NULL)
15260 return NULL;
15261 PyUnicode_InternInPlace(&s);
15262 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015263}
15264
Alexander Belopolsky40018472011-02-26 01:02:56 +000015265void
15266_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015267{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015269 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 Py_ssize_t i, n;
15271 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015272
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 if (interned == NULL || !PyDict_Check(interned))
15274 return;
15275 keys = PyDict_Keys(interned);
15276 if (keys == NULL || !PyList_Check(keys)) {
15277 PyErr_Clear();
15278 return;
15279 }
Walter Dörwald16807132007-05-25 13:52:07 +000015280
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15282 detector, interned unicode strings are not forcibly deallocated;
15283 rather, we give them their stolen references back, and then clear
15284 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015285
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 n = PyList_GET_SIZE(keys);
15287 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015288 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015290 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015291 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015292 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015294 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 case SSTATE_NOT_INTERNED:
15296 /* XXX Shouldn't happen */
15297 break;
15298 case SSTATE_INTERNED_IMMORTAL:
15299 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015300 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 break;
15302 case SSTATE_INTERNED_MORTAL:
15303 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015304 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015305 break;
15306 default:
15307 Py_FatalError("Inconsistent interned string state.");
15308 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015309 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 }
15311 fprintf(stderr, "total size of all interned strings: "
15312 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15313 "mortal/immortal\n", mortal_size, immortal_size);
15314 Py_DECREF(keys);
15315 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015316 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015317}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015318
15319
15320/********************* Unicode Iterator **************************/
15321
15322typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 PyObject_HEAD
15324 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015325 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015326} unicodeiterobject;
15327
15328static void
15329unicodeiter_dealloc(unicodeiterobject *it)
15330{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 _PyObject_GC_UNTRACK(it);
15332 Py_XDECREF(it->it_seq);
15333 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015334}
15335
15336static int
15337unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15338{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 Py_VISIT(it->it_seq);
15340 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015341}
15342
15343static PyObject *
15344unicodeiter_next(unicodeiterobject *it)
15345{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015346 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015347
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 assert(it != NULL);
15349 seq = it->it_seq;
15350 if (seq == NULL)
15351 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015352 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015354 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15355 int kind = PyUnicode_KIND(seq);
15356 void *data = PyUnicode_DATA(seq);
15357 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15358 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 if (item != NULL)
15360 ++it->it_index;
15361 return item;
15362 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015363
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015365 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015367}
15368
15369static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015370unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015371{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015372 Py_ssize_t len = 0;
15373 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015374 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015375 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015376}
15377
15378PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15379
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015380static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015381unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015382{
15383 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015384 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015385 it->it_seq, it->it_index);
15386 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015387 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015388 if (u == NULL)
15389 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015390 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015391 }
15392}
15393
15394PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15395
15396static PyObject *
15397unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15398{
15399 Py_ssize_t index = PyLong_AsSsize_t(state);
15400 if (index == -1 && PyErr_Occurred())
15401 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015402 if (it->it_seq != NULL) {
15403 if (index < 0)
15404 index = 0;
15405 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15406 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15407 it->it_index = index;
15408 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015409 Py_RETURN_NONE;
15410}
15411
15412PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15413
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015414static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015415 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015416 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015417 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15418 reduce_doc},
15419 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15420 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015421 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015422};
15423
15424PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015425 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15426 "str_iterator", /* tp_name */
15427 sizeof(unicodeiterobject), /* tp_basicsize */
15428 0, /* tp_itemsize */
15429 /* methods */
15430 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15431 0, /* tp_print */
15432 0, /* tp_getattr */
15433 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015434 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015435 0, /* tp_repr */
15436 0, /* tp_as_number */
15437 0, /* tp_as_sequence */
15438 0, /* tp_as_mapping */
15439 0, /* tp_hash */
15440 0, /* tp_call */
15441 0, /* tp_str */
15442 PyObject_GenericGetAttr, /* tp_getattro */
15443 0, /* tp_setattro */
15444 0, /* tp_as_buffer */
15445 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15446 0, /* tp_doc */
15447 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15448 0, /* tp_clear */
15449 0, /* tp_richcompare */
15450 0, /* tp_weaklistoffset */
15451 PyObject_SelfIter, /* tp_iter */
15452 (iternextfunc)unicodeiter_next, /* tp_iternext */
15453 unicodeiter_methods, /* tp_methods */
15454 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015455};
15456
15457static PyObject *
15458unicode_iter(PyObject *seq)
15459{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015460 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015461
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 if (!PyUnicode_Check(seq)) {
15463 PyErr_BadInternalCall();
15464 return NULL;
15465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015466 if (PyUnicode_READY(seq) == -1)
15467 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15469 if (it == NULL)
15470 return NULL;
15471 it->it_index = 0;
15472 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015473 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015474 _PyObject_GC_TRACK(it);
15475 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015476}
15477
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015478
15479size_t
15480Py_UNICODE_strlen(const Py_UNICODE *u)
15481{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015482 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015483}
15484
15485Py_UNICODE*
15486Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15487{
15488 Py_UNICODE *u = s1;
15489 while ((*u++ = *s2++));
15490 return s1;
15491}
15492
15493Py_UNICODE*
15494Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15495{
15496 Py_UNICODE *u = s1;
15497 while ((*u++ = *s2++))
15498 if (n-- == 0)
15499 break;
15500 return s1;
15501}
15502
15503Py_UNICODE*
15504Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15505{
15506 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015507 u1 += wcslen(u1);
15508 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015509 return s1;
15510}
15511
15512int
15513Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15514{
15515 while (*s1 && *s2 && *s1 == *s2)
15516 s1++, s2++;
15517 if (*s1 && *s2)
15518 return (*s1 < *s2) ? -1 : +1;
15519 if (*s1)
15520 return 1;
15521 if (*s2)
15522 return -1;
15523 return 0;
15524}
15525
15526int
15527Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15528{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015529 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015530 for (; n != 0; n--) {
15531 u1 = *s1;
15532 u2 = *s2;
15533 if (u1 != u2)
15534 return (u1 < u2) ? -1 : +1;
15535 if (u1 == '\0')
15536 return 0;
15537 s1++;
15538 s2++;
15539 }
15540 return 0;
15541}
15542
15543Py_UNICODE*
15544Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15545{
15546 const Py_UNICODE *p;
15547 for (p = s; *p; p++)
15548 if (*p == c)
15549 return (Py_UNICODE*)p;
15550 return NULL;
15551}
15552
15553Py_UNICODE*
15554Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15555{
15556 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015557 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015558 while (p != s) {
15559 p--;
15560 if (*p == c)
15561 return (Py_UNICODE*)p;
15562 }
15563 return NULL;
15564}
Victor Stinner331ea922010-08-10 16:37:20 +000015565
Victor Stinner71133ff2010-09-01 23:43:53 +000015566Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015567PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015568{
Victor Stinner577db2c2011-10-11 22:12:48 +020015569 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015570 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015572 if (!PyUnicode_Check(unicode)) {
15573 PyErr_BadArgument();
15574 return NULL;
15575 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015576 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015577 if (u == NULL)
15578 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015579 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015580 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015581 PyErr_NoMemory();
15582 return NULL;
15583 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015584 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015585 size *= sizeof(Py_UNICODE);
15586 copy = PyMem_Malloc(size);
15587 if (copy == NULL) {
15588 PyErr_NoMemory();
15589 return NULL;
15590 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015591 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015592 return copy;
15593}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015594
Georg Brandl66c221e2010-10-14 07:04:07 +000015595/* A _string module, to export formatter_parser and formatter_field_name_split
15596 to the string.Formatter class implemented in Python. */
15597
15598static PyMethodDef _string_methods[] = {
15599 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15600 METH_O, PyDoc_STR("split the argument as a field name")},
15601 {"formatter_parser", (PyCFunction) formatter_parser,
15602 METH_O, PyDoc_STR("parse the argument as a format string")},
15603 {NULL, NULL}
15604};
15605
15606static struct PyModuleDef _string_module = {
15607 PyModuleDef_HEAD_INIT,
15608 "_string",
15609 PyDoc_STR("string helper module"),
15610 0,
15611 _string_methods,
15612 NULL,
15613 NULL,
15614 NULL,
15615 NULL
15616};
15617
15618PyMODINIT_FUNC
15619PyInit__string(void)
15620{
15621 return PyModule_Create(&_string_module);
15622}
15623
15624
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015625#ifdef __cplusplus
15626}
15627#endif