blob: a797f838eb413977caf9cc912c31062e010fefd2 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Object validation used by the accessor macros of this file: debug builds
   run _PyUnicode_CheckConsistency() (with check_content=0), other builds
   only perform the PyUnicode_Check() type test. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors.

   The underscore-prefixed macros read a struct member directly and perform
   no checks (except _PyUnicode_KIND and _PyUnicode_GET_LENGTH, which assert
   _PyUnicode_CHECK).  PyUnicode_UTF8 and PyUnicode_UTF8_LENGTH additionally
   assert that the string is ready, and for compact ASCII strings they use
   the bytes stored right after the PyASCIIObject header and the character
   length instead of the utf8/utf8_length members. */

/* UTF-8 representation */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Members of the PyASCIIObject header */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer member of a PyUnicodeObject */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* Redefine PyUnicode_READY() for this file: validate the object with
   _PyUnicode_CHECK(), then evaluate to 0 when the string is already ready
   and to the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the utf8 pointer of the string is the same pointer as
   PyUnicode_DATA(), i.e. the UTF-8 representation is the character data
   itself.  Asserts that the string is not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the string is the same pointer as
   PyUnicode_DATA().  Only the macro argument is referenced, so the macro
   works for any expression and not just for a variable named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
156
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters, which should be of type
   "from_type *" (const-qualified pointers are fine: the source is only
   read).  to is a pointer of type "to_type *" and points to the buffer
   where the result characters are written to.

   The copy is done four characters per iteration for the part of the
   input that is a multiple of four, then one at a time for the rest.
   All internal variables are underscore-prefixed so that they cannot
   shadow an identifier of the caller. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
/* Overallocation factor.  On Windows, overallocating by 50% (factor 2) is
   the best choice; on Linux, overallocating by 25% (factor 4) is. */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215#define _Py_RETURN_UNICODE_EMPTY() \
216 do { \
217 _Py_INCREF_UNICODE_EMPTY(); \
218 return unicode_empty; \
219 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200221/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700222static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
224
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200225/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000228/* Single character Unicode strings in the Latin-1 range are being
229 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200230static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
/* Fast detection of the most frequent whitespace characters: a table of
   128 flags indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
262
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200263/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200264static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100266static int unicode_modifiable(PyObject *unicode);
267
Victor Stinnerfe226c02011-10-03 03:52:20 +0200268
Alexander Belopolsky40018472011-02-26 01:02:56 +0000269static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100270_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200271static PyObject *
272_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
273static PyObject *
274_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
275
276static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000278 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100279 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
281
Alexander Belopolsky40018472011-02-26 01:02:56 +0000282static void
283raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300284 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100285 PyObject *unicode,
286 Py_ssize_t startpos, Py_ssize_t endpos,
287 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000288
/* Same kind of table for line breaks: 128 flags indexed by ASCII code,
   1 for a line break character and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
316
INADA Naoki3ae20562017-01-16 20:41:20 +0900317static int convert_uc(PyObject *obj, void *addr);
318
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300319#include "clinic/unicodeobject.c.h"
320
Victor Stinner3d4226a2018-08-29 22:21:32 +0200321_Py_error_handler
322_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200323{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200324 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200325 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200326 }
327 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200328 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200329 }
330 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200331 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200332 }
333 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200334 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200335 }
336 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200337 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200338 }
339 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200340 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200341 }
342 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200343 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200344 }
Victor Stinner50149202015-09-22 00:26:54 +0200345 return _Py_ERROR_OTHER;
346}
347
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300348/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
349 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000351PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000352{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000353#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000354 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000355#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 /* This is actually an illegal character, so it should
357 not be passed to unichr. */
358 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000359#endif
360}
361
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a unicode object.

   op:            object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not of the wchar kind),
                  also scan the characters to verify that the narrowest
                  possible kind is used and that the data ends with a
                  null character.

   Every check is an assert(); the function itself always returns 1 so
   that a call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact non-ASCII string: the characters are stored right
               after the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* string that is not ready yet: only wstr is set */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(base->length == 0);
            assert(base->hash == -1);
            assert(base->state.compact == 0);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 0);
            assert(base->state.interned == SSTATE_NOT_INTERNED);
            assert(base->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* ready, non-compact string */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.compact == 0);
            assert(base->state.ready == 1);
            assert(data != NULL);
            if (base->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == base->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(base->wstr == data);
                assert(compact->wstr_length == base->length);
            }
            else {
                assert(base->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (base->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(base);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character data must end with a null character */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
477
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478static PyObject*
479unicode_result_wchar(PyObject *unicode)
480{
481#ifndef Py_DEBUG
482 Py_ssize_t len;
483
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100484 len = _PyUnicode_WSTR_LENGTH(unicode);
485 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100486 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200487 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 }
489
490 if (len == 1) {
491 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100492 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100493 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
494 Py_DECREF(unicode);
495 return latin1_char;
496 }
497 }
498
499 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200500 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 return NULL;
502 }
503#else
Victor Stinneraa771272012-10-04 02:32:58 +0200504 assert(Py_REFCNT(unicode) == 1);
505
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100506 /* don't make the result ready in debug mode to ensure that the caller
507 makes the string ready before using it */
508 assert(_PyUnicode_CheckConsistency(unicode, 1));
509#endif
510 return unicode;
511}
512
513static PyObject*
514unicode_result_ready(PyObject *unicode)
515{
516 Py_ssize_t length;
517
518 length = PyUnicode_GET_LENGTH(unicode);
519 if (length == 0) {
520 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100521 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200522 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100523 }
524 return unicode_empty;
525 }
526
527 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200528 void *data = PyUnicode_DATA(unicode);
529 int kind = PyUnicode_KIND(unicode);
530 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100531 if (ch < 256) {
532 PyObject *latin1_char = unicode_latin1[ch];
533 if (latin1_char != NULL) {
534 if (unicode != latin1_char) {
535 Py_INCREF(latin1_char);
536 Py_DECREF(unicode);
537 }
538 return latin1_char;
539 }
540 else {
541 assert(_PyUnicode_CheckConsistency(unicode, 1));
542 Py_INCREF(unicode);
543 unicode_latin1[ch] = unicode;
544 return unicode;
545 }
546 }
547 }
548
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550 return unicode;
551}
552
553static PyObject*
554unicode_result(PyObject *unicode)
555{
556 assert(_PyUnicode_CHECK(unicode));
557 if (PyUnicode_IS_READY(unicode))
558 return unicode_result_ready(unicode);
559 else
560 return unicode_result_wchar(unicode);
561}
562
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563static PyObject*
564unicode_result_unchanged(PyObject *unicode)
565{
566 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500567 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100568 return NULL;
569 Py_INCREF(unicode);
570 return unicode;
571 }
572 else
573 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100574 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100575}
576
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200577/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
578 ASCII, Latin1, UTF-8, etc. */
579static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200580backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200581 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
582{
Victor Stinnerad771582015-10-09 12:38:53 +0200583 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200584 Py_UCS4 ch;
585 enum PyUnicode_Kind kind;
586 void *data;
587
588 assert(PyUnicode_IS_READY(unicode));
589 kind = PyUnicode_KIND(unicode);
590 data = PyUnicode_DATA(unicode);
591
592 size = 0;
593 /* determine replacement size */
594 for (i = collstart; i < collend; ++i) {
595 Py_ssize_t incr;
596
597 ch = PyUnicode_READ(kind, data, i);
598 if (ch < 0x100)
599 incr = 2+2;
600 else if (ch < 0x10000)
601 incr = 2+4;
602 else {
603 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200604 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200605 }
606 if (size > PY_SSIZE_T_MAX - incr) {
607 PyErr_SetString(PyExc_OverflowError,
608 "encoded result is too long for a Python string");
609 return NULL;
610 }
611 size += incr;
612 }
613
Victor Stinnerad771582015-10-09 12:38:53 +0200614 str = _PyBytesWriter_Prepare(writer, str, size);
615 if (str == NULL)
616 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200617
618 /* generate replacement */
619 for (i = collstart; i < collend; ++i) {
620 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200621 *str++ = '\\';
622 if (ch >= 0x00010000) {
623 *str++ = 'U';
624 *str++ = Py_hexdigits[(ch>>28)&0xf];
625 *str++ = Py_hexdigits[(ch>>24)&0xf];
626 *str++ = Py_hexdigits[(ch>>20)&0xf];
627 *str++ = Py_hexdigits[(ch>>16)&0xf];
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200630 }
Victor Stinner797485e2015-10-09 03:17:30 +0200631 else if (ch >= 0x100) {
632 *str++ = 'u';
633 *str++ = Py_hexdigits[(ch>>12)&0xf];
634 *str++ = Py_hexdigits[(ch>>8)&0xf];
635 }
636 else
637 *str++ = 'x';
638 *str++ = Py_hexdigits[(ch>>4)&0xf];
639 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200640 }
641 return str;
642}
643
644/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
645 ASCII, Latin1, UTF-8, etc. */
646static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200647xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200648 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
649{
Victor Stinnerad771582015-10-09 12:38:53 +0200650 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 Py_UCS4 ch;
652 enum PyUnicode_Kind kind;
653 void *data;
654
655 assert(PyUnicode_IS_READY(unicode));
656 kind = PyUnicode_KIND(unicode);
657 data = PyUnicode_DATA(unicode);
658
659 size = 0;
660 /* determine replacement size */
661 for (i = collstart; i < collend; ++i) {
662 Py_ssize_t incr;
663
664 ch = PyUnicode_READ(kind, data, i);
665 if (ch < 10)
666 incr = 2+1+1;
667 else if (ch < 100)
668 incr = 2+2+1;
669 else if (ch < 1000)
670 incr = 2+3+1;
671 else if (ch < 10000)
672 incr = 2+4+1;
673 else if (ch < 100000)
674 incr = 2+5+1;
675 else if (ch < 1000000)
676 incr = 2+6+1;
677 else {
678 assert(ch <= MAX_UNICODE);
679 incr = 2+7+1;
680 }
681 if (size > PY_SSIZE_T_MAX - incr) {
682 PyErr_SetString(PyExc_OverflowError,
683 "encoded result is too long for a Python string");
684 return NULL;
685 }
686 size += incr;
687 }
688
Victor Stinnerad771582015-10-09 12:38:53 +0200689 str = _PyBytesWriter_Prepare(writer, str, size);
690 if (str == NULL)
691 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692
693 /* generate replacement */
694 for (i = collstart; i < collend; ++i) {
695 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
696 }
697 return str;
698}
699
Thomas Wouters477c8d52006-05-27 19:21:47 +0000700/* --- Bloom Filters ----------------------------------------------------- */
701
702/* Helpers implementing simple "bloom filters" for Unicode characters.
703 To keep things simple, we use a single bitmask, using the low
704 log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
705
706/* the linebreak mask is set up by Unicode_Init below */
707
Antoine Pitrouf068f942010-01-13 14:19:12 +0000708#if LONG_BIT >= 128
709#define BLOOM_WIDTH 128
710#elif LONG_BIT >= 64
711#define BLOOM_WIDTH 64
712#elif LONG_BIT >= 32
713#define BLOOM_WIDTH 32
714#else
715#error "LONG_BIT is smaller than 32"
716#endif
717
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718#define BLOOM_MASK unsigned long
719
Serhiy Storchaka05997252013-01-26 12:14:02 +0200720static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000721
/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that expression arguments such as
   BLOOM(a | b, ch) are evaluated as a whole. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000723
Benjamin Peterson29060642009-01-31 22:14:21 +0000724#define BLOOM_LINEBREAK(ch) \
725 ((ch) < 128U ? ascii_linebreak[(ch)] : \
726 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000727
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700728static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730{
Victor Stinnera85af502013-04-09 21:53:54 +0200731#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
732 do { \
733 TYPE *data = (TYPE *)PTR; \
734 TYPE *end = data + LEN; \
735 Py_UCS4 ch; \
736 for (; data != end; data++) { \
737 ch = *data; \
738 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
739 } \
740 break; \
741 } while (0)
742
Thomas Wouters477c8d52006-05-27 19:21:47 +0000743 /* calculate simple bloom-style bitmask for a given unicode string */
744
Antoine Pitrouf068f942010-01-13 14:19:12 +0000745 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000746
747 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200748 switch (kind) {
749 case PyUnicode_1BYTE_KIND:
750 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
751 break;
752 case PyUnicode_2BYTE_KIND:
753 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
754 break;
755 case PyUnicode_4BYTE_KIND:
756 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
757 break;
758 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700759 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200760 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000761 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200762
763#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764}
765
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300766static int
767ensure_unicode(PyObject *obj)
768{
769 if (!PyUnicode_Check(obj)) {
770 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200771 "must be str, not %.100s",
772 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300773 return -1;
774 }
775 return PyUnicode_READY(obj);
776}
777
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200778/* Compilation of templated routines */
779
780#include "stringlib/asciilib.h"
781#include "stringlib/fastsearch.h"
782#include "stringlib/partition.h"
783#include "stringlib/split.h"
784#include "stringlib/count.h"
785#include "stringlib/find.h"
786#include "stringlib/find_max_char.h"
787#include "stringlib/localeutil.h"
788#include "stringlib/undef.h"
789
790#include "stringlib/ucs1lib.h"
791#include "stringlib/fastsearch.h"
792#include "stringlib/partition.h"
793#include "stringlib/split.h"
794#include "stringlib/count.h"
795#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300796#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200797#include "stringlib/find_max_char.h"
798#include "stringlib/localeutil.h"
799#include "stringlib/undef.h"
800
801#include "stringlib/ucs2lib.h"
802#include "stringlib/fastsearch.h"
803#include "stringlib/partition.h"
804#include "stringlib/split.h"
805#include "stringlib/count.h"
806#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300807#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200808#include "stringlib/find_max_char.h"
809#include "stringlib/localeutil.h"
810#include "stringlib/undef.h"
811
812#include "stringlib/ucs4lib.h"
813#include "stringlib/fastsearch.h"
814#include "stringlib/partition.h"
815#include "stringlib/split.h"
816#include "stringlib/count.h"
817#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300818#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200819#include "stringlib/find_max_char.h"
820#include "stringlib/localeutil.h"
821#include "stringlib/undef.h"
822
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823#include "stringlib/unicodedefs.h"
824#include "stringlib/fastsearch.h"
825#include "stringlib/count.h"
826#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100827#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200828
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829/* --- Unicode Object ----------------------------------------------------- */
830
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700831static inline Py_ssize_t
832findchar(const void *s, int kind,
833 Py_ssize_t size, Py_UCS4 ch,
834 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200835{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200836 switch (kind) {
837 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200838 if ((Py_UCS1) ch != ch)
839 return -1;
840 if (direction > 0)
841 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
842 else
843 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200844 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200845 if ((Py_UCS2) ch != ch)
846 return -1;
847 if (direction > 0)
848 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
849 else
850 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200851 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200852 if (direction > 0)
853 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
854 else
855 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200856 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700857 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859}
860
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, length) of
   `unicode` with 0xff bytes so that code reading them before they are
   initialized is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used directly as the per-character byte width */
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
879
Victor Stinnerfe226c02011-10-03 03:52:20 +0200880static PyObject*
881resize_compact(PyObject *unicode, Py_ssize_t length)
882{
883 Py_ssize_t char_size;
884 Py_ssize_t struct_size;
885 Py_ssize_t new_size;
886 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100887 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200888#ifdef Py_DEBUG
889 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
890#endif
891
Victor Stinner79891572012-05-03 13:43:07 +0200892 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200893 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100894 assert(PyUnicode_IS_COMPACT(unicode));
895
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200896 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100897 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200898 struct_size = sizeof(PyASCIIObject);
899 else
900 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200901 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902
Victor Stinnerfe226c02011-10-03 03:52:20 +0200903 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
904 PyErr_NoMemory();
905 return NULL;
906 }
907 new_size = (struct_size + (length + 1) * char_size);
908
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200909 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
910 PyObject_DEL(_PyUnicode_UTF8(unicode));
911 _PyUnicode_UTF8(unicode) = NULL;
912 _PyUnicode_UTF8_LENGTH(unicode) = 0;
913 }
Victor Stinner84def372011-12-11 20:04:56 +0100914 _Py_DEC_REFTOTAL;
915 _Py_ForgetReference(unicode);
916
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300917 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100918 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200920 PyErr_NoMemory();
921 return NULL;
922 }
Victor Stinner84def372011-12-11 20:04:56 +0100923 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200924 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100925
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200927 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200928 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100929 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200930 _PyUnicode_WSTR_LENGTH(unicode) = length;
931 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100932 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
933 PyObject_DEL(_PyUnicode_WSTR(unicode));
934 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100935 if (!PyUnicode_IS_ASCII(unicode))
936 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100937 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200938#ifdef Py_DEBUG
939 unicode_fill_invalid(unicode, old_length);
940#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
942 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200943 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 return unicode;
945}
946
Alexander Belopolsky40018472011-02-26 01:02:56 +0000947static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200948resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949{
Victor Stinner95663112011-10-04 01:03:50 +0200950 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100951 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200952 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000954
Victor Stinnerfe226c02011-10-03 03:52:20 +0200955 if (PyUnicode_IS_READY(unicode)) {
956 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200957 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200958 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200959#ifdef Py_DEBUG
960 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
961#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200962
963 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200964 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200965 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
966 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200967
968 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
969 PyErr_NoMemory();
970 return -1;
971 }
972 new_size = (length + 1) * char_size;
973
Victor Stinner7a9105a2011-12-12 00:13:42 +0100974 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
975 {
976 PyObject_DEL(_PyUnicode_UTF8(unicode));
977 _PyUnicode_UTF8(unicode) = NULL;
978 _PyUnicode_UTF8_LENGTH(unicode) = 0;
979 }
980
Victor Stinnerfe226c02011-10-03 03:52:20 +0200981 data = (PyObject *)PyObject_REALLOC(data, new_size);
982 if (data == NULL) {
983 PyErr_NoMemory();
984 return -1;
985 }
986 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200987 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200988 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200989 _PyUnicode_WSTR_LENGTH(unicode) = length;
990 }
991 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200992 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200993 _PyUnicode_UTF8_LENGTH(unicode) = length;
994 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 _PyUnicode_LENGTH(unicode) = length;
996 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200997#ifdef Py_DEBUG
998 unicode_fill_invalid(unicode, old_length);
999#endif
Victor Stinner95663112011-10-04 01:03:50 +02001000 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001001 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004 }
Victor Stinner95663112011-10-04 01:03:50 +02001005 assert(_PyUnicode_WSTR(unicode) != NULL);
1006
1007 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001008 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001009 PyErr_NoMemory();
1010 return -1;
1011 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001012 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001013 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001014 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (!wstr) {
1016 PyErr_NoMemory();
1017 return -1;
1018 }
1019 _PyUnicode_WSTR(unicode) = wstr;
1020 _PyUnicode_WSTR(unicode)[length] = 0;
1021 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001022 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 return 0;
1024}
1025
Victor Stinnerfe226c02011-10-03 03:52:20 +02001026static PyObject*
1027resize_copy(PyObject *unicode, Py_ssize_t length)
1028{
1029 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001031 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001032
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001033 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001042 }
1043 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001044 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001046 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001052 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001053 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 }
1055}
1056
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060
1061 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001062 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064*/
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001069 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
Thomas Wouters477c8d52006-05-27 19:21:47 +00001072 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
1077
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001078 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 }
1087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001107 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001109 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111
Jeremy Hyltond8082792003-09-16 19:41:39 +00001112 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001121
Victor Stinner7931d9a2011-11-04 00:22:48 +01001122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 return unicode;
1124}
1125
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
Victor Stinner42dfd712011-10-03 14:41:45 +02001129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001135 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "ascii";
1155 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001156 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001157 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001158 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001159 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001160 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001161 default:
1162 return "<invalid compact kind>";
1163 }
1164}
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1171
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1175void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001192
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001200 else
1201 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001204
Victor Stinnera849a4b2011-10-03 12:12:11 +02001205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001208
Victor Stinnera3b334d2011-10-03 13:53:37 +02001209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001215 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001216 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001226 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001227 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001234 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 }
1236
Victor Stinner9e9d6892011-10-04 01:02:02 +02001237 is_ascii = 0;
1238 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001241 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001247 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001251 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
Victor Stinner8f825062012-04-27 13:55:39 +02001304 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001309 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 else {
1312 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001313 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001314 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001316 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
Victor Stinner8f825062012-04-27 13:55:39 +02001327#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001328 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001329#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Helper to convert a 16-bit wchar_t representation to UCS4.  A high
   surrogate immediately followed by a low surrogate is decoded into a
   single code point; every other unit is copied as is.  The other
   conversions are implemented as macros for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number of
   decoded code points: the asserts check that the output neither overruns
   nor falls short of that length.  No terminating null character is
   written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (is_pair) {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1373
Victor Stinnercd9950f2011-10-02 00:34:53 +02001374static int
Victor Stinner488fa492011-12-12 00:01:39 +01001375unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001376{
Victor Stinner488fa492011-12-12 00:01:39 +01001377 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001378 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001379 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001380 return -1;
1381 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001382 return 0;
1383}
1384
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Victor Stinneree4544c2012-05-09 22:24:08 +02001393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001397 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
Victor Stinnerd3f08822012-05-29 12:57:52 +02001400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 if (how_many == 0)
1405 return 0;
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001408 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001410 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Victor Stinnerf1852262012-06-16 16:38:26 +02001412#ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424#endif
1425
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001426 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001437 }
Christian Heimesf051e432016-09-13 20:22:02 +02001438 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001451 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001452 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001471 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001472 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001507 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001508 }
1509 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001511 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 Py_ssize_t i;
1514
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 for (i=0; i < how_many; i++) {
1516 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 if (ch > to_maxchar)
1518 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1520 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 }
1522 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523 return 0;
1524}
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526void
1527_PyUnicode_FastCopyCharacters(
1528 PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530{
1531 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1532}
1533
1534Py_ssize_t
1535PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1536 PyObject *from, Py_ssize_t from_start,
1537 Py_ssize_t how_many)
1538{
1539 int err;
1540
1541 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1542 PyErr_BadInternalCall();
1543 return -1;
1544 }
1545
Benjamin Petersonbac79492012-01-14 13:34:47 -05001546 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001548 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001549 return -1;
1550
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001551 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001552 PyErr_SetString(PyExc_IndexError, "string index out of range");
1553 return -1;
1554 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001555 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001559 if (how_many < 0) {
1560 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1561 return -1;
1562 }
1563 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1565 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001566 "Cannot write %zi characters at %zi "
1567 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001568 how_many, to_start, PyUnicode_GET_LENGTH(to));
1569 return -1;
1570 }
1571
1572 if (how_many == 0)
1573 return 0;
1574
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001576 return -1;
1577
1578 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1579 if (err) {
1580 PyErr_Format(PyExc_SystemError,
1581 "Cannot copy %s characters "
1582 "into a string of %s characters",
1583 unicode_kind_name(from),
1584 unicode_kind_name(to));
1585 return -1;
1586 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001587 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588}
1589
Victor Stinner17222162011-09-28 22:15:37 +02001590/* Find the maximum code point and count the number of surrogate pairs so a
1591 correct string length can be computed before converting a string to UCS4.
1592 This function counts single surrogates as a character and not as a pair.
1593
1594 Return 0 on success, or -1 on error. */
1595static int
1596find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1597 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598{
1599 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001600 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601
Victor Stinnerc53be962011-10-02 21:33:54 +02001602 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 *num_surrogates = 0;
1604 *maxchar = 0;
1605
1606 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001608 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1609 && (iter+1) < end
1610 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1611 {
1612 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1613 ++(*num_surrogates);
1614 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
1616 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001618 {
1619 ch = *iter;
1620 iter++;
1621 }
1622 if (ch > *maxchar) {
1623 *maxchar = ch;
1624 if (*maxchar > MAX_UNICODE) {
1625 PyErr_Format(PyExc_ValueError,
1626 "character U+%x is not in range [U+0000; U+10ffff]",
1627 ch);
1628 return -1;
1629 }
1630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 }
1632 return 0;
1633}
1634
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001635int
1636_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001637{
1638 wchar_t *end;
1639 Py_UCS4 maxchar = 0;
1640 Py_ssize_t num_surrogates;
1641#if SIZEOF_WCHAR_T == 2
1642 Py_ssize_t length_wo_surrogates;
1643#endif
1644
Georg Brandl7597add2011-10-05 16:36:47 +02001645 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001646 strings were created using _PyObject_New() and where no canonical
1647 representation (the str field) has been set yet aka strings
1648 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001649 assert(_PyUnicode_CHECK(unicode));
1650 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001652 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001653 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001654 /* Actually, it should neither be interned nor be anything else: */
1655 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001658 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001659 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661
1662 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001663 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1664 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 PyErr_NoMemory();
1666 return -1;
1667 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001668 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 _PyUnicode_WSTR(unicode), end,
1670 PyUnicode_1BYTE_DATA(unicode));
1671 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1672 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1673 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1674 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001675 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001676 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001677 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678 }
1679 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001680 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001681 _PyUnicode_UTF8(unicode) = NULL;
1682 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 }
1684 PyObject_FREE(_PyUnicode_WSTR(unicode));
1685 _PyUnicode_WSTR(unicode) = NULL;
1686 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1687 }
1688 /* In this case we might have to convert down from 4-byte native
1689 wchar_t to 2-byte unicode. */
1690 else if (maxchar < 65536) {
1691 assert(num_surrogates == 0 &&
1692 "FindMaxCharAndNumSurrogatePairs() messed up");
1693
Victor Stinner506f5922011-09-28 22:34:18 +02001694#if SIZEOF_WCHAR_T == 2
1695 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001696 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001697 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1698 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1699 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001700 _PyUnicode_UTF8(unicode) = NULL;
1701 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001702#else
1703 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001705 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001706 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001707 PyErr_NoMemory();
1708 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 }
Victor Stinner506f5922011-09-28 22:34:18 +02001710 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1711 _PyUnicode_WSTR(unicode), end,
1712 PyUnicode_2BYTE_DATA(unicode));
1713 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1714 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1715 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001716 _PyUnicode_UTF8(unicode) = NULL;
1717 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001718 PyObject_FREE(_PyUnicode_WSTR(unicode));
1719 _PyUnicode_WSTR(unicode) = NULL;
1720 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1721#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 }
1723 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1724 else {
1725#if SIZEOF_WCHAR_T == 2
1726 /* in case the native representation is 2-bytes, we need to allocate a
1727 new normalized 4-byte version. */
1728 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001729 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1730 PyErr_NoMemory();
1731 return -1;
1732 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001733 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1734 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 PyErr_NoMemory();
1736 return -1;
1737 }
1738 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1739 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001740 _PyUnicode_UTF8(unicode) = NULL;
1741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001742 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1743 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001744 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 PyObject_FREE(_PyUnicode_WSTR(unicode));
1746 _PyUnicode_WSTR(unicode) = NULL;
1747 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1748#else
1749 assert(num_surrogates == 0);
1750
Victor Stinnerc3c74152011-10-02 20:39:55 +02001751 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001753 _PyUnicode_UTF8(unicode) = NULL;
1754 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1756#endif
1757 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1758 }
1759 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001760 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 return 0;
1762}
1763
Alexander Belopolsky40018472011-02-26 01:02:56 +00001764static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001765unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766{
Walter Dörwald16807132007-05-25 13:52:07 +00001767 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001768 case SSTATE_NOT_INTERNED:
1769 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001770
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 case SSTATE_INTERNED_MORTAL:
1772 /* revive dead object temporarily for DelItem */
1773 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001774 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 Py_FatalError(
1776 "deletion of interned string failed");
1777 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001778
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 case SSTATE_INTERNED_IMMORTAL:
1780 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001781 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001782
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 default:
1784 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001785 }
1786
Victor Stinner03490912011-10-03 23:45:12 +02001787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001790 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001794 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795}
1796
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is one of the shared singletons, i.e.
   the empty string or a one-character string stored in the unicode_latin1
   cache, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* only a string with a canonical kind and exactly one character can be
       a cached Latin-1 character */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1813
Alexander Belopolsky40018472011-02-26 01:02:56 +00001814static int
Victor Stinner488fa492011-12-12 00:01:39 +01001815unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001816{
Victor Stinner488fa492011-12-12 00:01:39 +01001817 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001826#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001829#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001830 return 1;
1831}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001832
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
Victor Stinner910337b2011-10-03 03:20:16 +02001846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001857 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001858 return 0;
1859 }
1860
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001865 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001866 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867 }
1868
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001872 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001873 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001875 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001876 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001881{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001894}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001895
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001897
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001904{
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001907 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001912#ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001920 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001921 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
Victor Stinner184252a2012-06-16 02:57:41 +02001928 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001932 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
Victor Stinner184252a2012-06-16 02:57:41 +02001940 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 }
1945 }
1946}
1947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
Victor Stinnera464fc12011-10-02 20:39:30 +02001951 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001953 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001957 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001961 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962}
1963
Victor Stinner985a82a2014-01-03 12:53:47 +01001964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
Victor Stinner985a82a2014-01-03 12:53:47 +01001974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001977
1978 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1979 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001980 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001981 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001982 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1983 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1984 }
1985 assert(_PyUnicode_CheckConsistency(unicode, 1));
1986 return unicode;
1987}
1988
Alexander Belopolsky40018472011-02-26 01:02:56 +00001989PyObject *
1990PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001992 if (u == NULL)
1993 return (PyObject*)_PyUnicode_New(size);
1994
1995 if (size < 0) {
1996 PyErr_BadInternalCall();
1997 return NULL;
1998 }
1999
2000 return PyUnicode_FromWideChar(u, size);
2001}
2002
2003PyObject *
2004PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2005{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002006 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 Py_UCS4 maxchar = 0;
2008 Py_ssize_t num_surrogates;
2009
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002010 if (u == NULL && size != 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 if (size == -1) {
2016 size = wcslen(u);
2017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002019 /* If the Unicode data is known at construction time, we can apply
2020 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002023 if (size == 0)
2024 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 /* Single character Unicode objects in the Latin-1 range are
2027 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002028 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 return get_latin1_char((unsigned char)*u);
2030
2031 /* If not empty and not single character, copy the Unicode data
2032 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002033 if (find_maxchar_surrogates(u, u + size,
2034 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return NULL;
2036
Victor Stinner8faf8212011-12-08 22:14:11 +01002037 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 if (!unicode)
2039 return NULL;
2040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 switch (PyUnicode_KIND(unicode)) {
2042 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002043 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2045 break;
2046 case PyUnicode_2BYTE_KIND:
2047#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002048 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002050 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2052#endif
2053 break;
2054 case PyUnicode_4BYTE_KIND:
2055#if SIZEOF_WCHAR_T == 2
2056 /* This is the only case which has to process surrogates, thus
2057 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002058 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059#else
2060 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002061 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062#endif
2063 break;
2064 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002065 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002068 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069}
2070
Alexander Belopolsky40018472011-02-26 01:02:56 +00002071PyObject *
2072PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002073{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002074 if (size < 0) {
2075 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002076 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002077 return NULL;
2078 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002079 if (u != NULL)
2080 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2081 else
2082 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002083}
2084
Alexander Belopolsky40018472011-02-26 01:02:56 +00002085PyObject *
2086PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002087{
2088 size_t size = strlen(u);
2089 if (size > PY_SSIZE_T_MAX) {
2090 PyErr_SetString(PyExc_OverflowError, "input too long");
2091 return NULL;
2092 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002093 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002094}
2095
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002096PyObject *
2097_PyUnicode_FromId(_Py_Identifier *id)
2098{
2099 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002100 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2101 strlen(id->string),
2102 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002103 if (!id->object)
2104 return NULL;
2105 PyUnicode_InternInPlace(&id->object);
2106 assert(!id->next);
2107 id->next = static_strings;
2108 static_strings = id;
2109 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002110 return id->object;
2111}
2112
2113void
2114_PyUnicode_ClearStaticStrings()
2115{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002116 _Py_Identifier *tmp, *s = static_strings;
2117 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002118 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002119 tmp = s->next;
2120 s->next = NULL;
2121 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002122 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002123 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002124}
2125
Benjamin Peterson0df54292012-03-26 14:50:32 -04002126/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002127
Victor Stinnerd3f08822012-05-29 12:57:52 +02002128PyObject*
2129_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002130{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002131 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002132 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002133 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002134#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002135 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002136#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002137 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002138 }
Victor Stinner785938e2011-12-11 20:09:03 +01002139 unicode = PyUnicode_New(size, 127);
2140 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002141 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002142 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2143 assert(_PyUnicode_CheckConsistency(unicode, 1));
2144 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002145}
2146
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002147static Py_UCS4
2148kind_maxchar_limit(unsigned int kind)
2149{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002150 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002151 case PyUnicode_1BYTE_KIND:
2152 return 0x80;
2153 case PyUnicode_2BYTE_KIND:
2154 return 0x100;
2155 case PyUnicode_4BYTE_KIND:
2156 return 0x10000;
2157 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002158 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002159 }
2160}
2161
Victor Stinner702c7342011-10-05 13:50:52 +02002162static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002163_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002166 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002167
Serhiy Storchaka678db842013-01-26 12:16:36 +02002168 if (size == 0)
2169 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002170 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002171 if (size == 1)
2172 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002173
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002174 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002175 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176 if (!res)
2177 return NULL;
2178 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002179 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002181}
2182
Victor Stinnere57b1c02011-09-28 22:20:48 +02002183static PyObject*
2184_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185{
2186 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002187 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002188
Serhiy Storchaka678db842013-01-26 12:16:36 +02002189 if (size == 0)
2190 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002191 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002192 if (size == 1)
2193 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002194
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002195 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002196 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 if (!res)
2198 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002199 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002201 else {
2202 _PyUnicode_CONVERT_BYTES(
2203 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2204 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002205 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 return res;
2207}
2208
Victor Stinnere57b1c02011-09-28 22:20:48 +02002209static PyObject*
2210_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211{
2212 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002214
Serhiy Storchaka678db842013-01-26 12:16:36 +02002215 if (size == 0)
2216 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002217 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002218 if (size == 1)
2219 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002220
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002221 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002222 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if (!res)
2224 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002225 if (max_char < 256)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2227 PyUnicode_1BYTE_DATA(res));
2228 else if (max_char < 0x10000)
2229 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2230 PyUnicode_2BYTE_DATA(res));
2231 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002233 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return res;
2235}
2236
2237PyObject*
2238PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2239{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002240 if (size < 0) {
2241 PyErr_SetString(PyExc_ValueError, "size must be positive");
2242 return NULL;
2243 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002244 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002246 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002248 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002250 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002252 PyErr_SetString(PyExc_SystemError, "invalid kind");
2253 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255}
2256
Victor Stinnerece58de2012-04-23 23:36:38 +02002257Py_UCS4
2258_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2259{
2260 enum PyUnicode_Kind kind;
2261 void *startptr, *endptr;
2262
2263 assert(PyUnicode_IS_READY(unicode));
2264 assert(0 <= start);
2265 assert(end <= PyUnicode_GET_LENGTH(unicode));
2266 assert(start <= end);
2267
2268 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2269 return PyUnicode_MAX_CHAR_VALUE(unicode);
2270
2271 if (start == end)
2272 return 127;
2273
Victor Stinner94d558b2012-04-27 22:26:58 +02002274 if (PyUnicode_IS_ASCII(unicode))
2275 return 127;
2276
Victor Stinnerece58de2012-04-23 23:36:38 +02002277 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002278 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002279 endptr = (char *)startptr + end * kind;
2280 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002281 switch(kind) {
2282 case PyUnicode_1BYTE_KIND:
2283 return ucs1lib_find_max_char(startptr, endptr);
2284 case PyUnicode_2BYTE_KIND:
2285 return ucs2lib_find_max_char(startptr, endptr);
2286 case PyUnicode_4BYTE_KIND:
2287 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002288 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002289 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002290 }
2291}
2292
Victor Stinner25a4b292011-10-06 12:31:55 +02002293/* Ensure that a string uses the most efficient storage, if it is not the
2294 case: create a new string with of the right kind. Write NULL into *p_unicode
2295 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002296static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002297unicode_adjust_maxchar(PyObject **p_unicode)
2298{
2299 PyObject *unicode, *copy;
2300 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002301 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002302 unsigned int kind;
2303
2304 assert(p_unicode != NULL);
2305 unicode = *p_unicode;
2306 assert(PyUnicode_IS_READY(unicode));
2307 if (PyUnicode_IS_ASCII(unicode))
2308 return;
2309
2310 len = PyUnicode_GET_LENGTH(unicode);
2311 kind = PyUnicode_KIND(unicode);
2312 if (kind == PyUnicode_1BYTE_KIND) {
2313 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002314 max_char = ucs1lib_find_max_char(u, u + len);
2315 if (max_char >= 128)
2316 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002317 }
2318 else if (kind == PyUnicode_2BYTE_KIND) {
2319 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs2lib_find_max_char(u, u + len);
2321 if (max_char >= 256)
2322 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002323 }
2324 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002325 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002326 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002327 max_char = ucs4lib_find_max_char(u, u + len);
2328 if (max_char >= 0x10000)
2329 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002330 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002331 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002332 if (copy != NULL)
2333 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002334 Py_DECREF(unicode);
2335 *p_unicode = copy;
2336}
2337
Victor Stinner034f6cf2011-09-30 02:26:44 +02002338PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002339_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002340{
Victor Stinner87af4f22011-11-21 23:03:47 +01002341 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002342 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002343
Victor Stinner034f6cf2011-09-30 02:26:44 +02002344 if (!PyUnicode_Check(unicode)) {
2345 PyErr_BadInternalCall();
2346 return NULL;
2347 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002348 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002349 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002350
Victor Stinner87af4f22011-11-21 23:03:47 +01002351 length = PyUnicode_GET_LENGTH(unicode);
2352 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002353 if (!copy)
2354 return NULL;
2355 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2356
Christian Heimesf051e432016-09-13 20:22:02 +02002357 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002358 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002359 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002360 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002361}
2362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363
Victor Stinnerbc603d12011-10-02 01:00:40 +02002364/* Widen Unicode objects to larger buffers. Don't write terminating null
2365 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366
2367void*
2368_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2369{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002370 Py_ssize_t len;
2371 void *result;
2372 unsigned int skind;
2373
Benjamin Petersonbac79492012-01-14 13:34:47 -05002374 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002375 return NULL;
2376
2377 len = PyUnicode_GET_LENGTH(s);
2378 skind = PyUnicode_KIND(s);
2379 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002380 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 return NULL;
2382 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002383 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002385 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002386 if (!result)
2387 return PyErr_NoMemory();
2388 assert(skind == PyUnicode_1BYTE_KIND);
2389 _PyUnicode_CONVERT_BYTES(
2390 Py_UCS1, Py_UCS2,
2391 PyUnicode_1BYTE_DATA(s),
2392 PyUnicode_1BYTE_DATA(s) + len,
2393 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002395 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002396 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002397 if (!result)
2398 return PyErr_NoMemory();
2399 if (skind == PyUnicode_2BYTE_KIND) {
2400 _PyUnicode_CONVERT_BYTES(
2401 Py_UCS2, Py_UCS4,
2402 PyUnicode_2BYTE_DATA(s),
2403 PyUnicode_2BYTE_DATA(s) + len,
2404 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002406 else {
2407 assert(skind == PyUnicode_1BYTE_KIND);
2408 _PyUnicode_CONVERT_BYTES(
2409 Py_UCS1, Py_UCS4,
2410 PyUnicode_1BYTE_DATA(s),
2411 PyUnicode_1BYTE_DATA(s) + len,
2412 result);
2413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002415 default:
2416 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 }
Victor Stinner01698042011-10-04 00:04:26 +02002418 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 return NULL;
2420}
2421
2422static Py_UCS4*
2423as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2424 int copy_null)
2425{
2426 int kind;
2427 void *data;
2428 Py_ssize_t len, targetlen;
2429 if (PyUnicode_READY(string) == -1)
2430 return NULL;
2431 kind = PyUnicode_KIND(string);
2432 data = PyUnicode_DATA(string);
2433 len = PyUnicode_GET_LENGTH(string);
2434 targetlen = len;
2435 if (copy_null)
2436 targetlen++;
2437 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002438 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (!target) {
2440 PyErr_NoMemory();
2441 return NULL;
2442 }
2443 }
2444 else {
2445 if (targetsize < targetlen) {
2446 PyErr_Format(PyExc_SystemError,
2447 "string is longer than the buffer");
2448 if (copy_null && 0 < targetsize)
2449 target[0] = 0;
2450 return NULL;
2451 }
2452 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002453 if (kind == PyUnicode_1BYTE_KIND) {
2454 Py_UCS1 *start = (Py_UCS1 *) data;
2455 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002457 else if (kind == PyUnicode_2BYTE_KIND) {
2458 Py_UCS2 *start = (Py_UCS2 *) data;
2459 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2460 }
2461 else {
2462 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002463 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 if (copy_null)
2466 target[len] = 0;
2467 return target;
2468}
2469
2470Py_UCS4*
2471PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2472 int copy_null)
2473{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002474 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 PyErr_BadInternalCall();
2476 return NULL;
2477 }
2478 return as_ucs4(string, target, targetsize, copy_null);
2479}
2480
2481Py_UCS4*
2482PyUnicode_AsUCS4Copy(PyObject *string)
2483{
2484 return as_ucs4(string, NULL, 0, 1);
2485}
2486
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   1 + (SIZEOF_LONG_LONG*53-1) / 22 rounds SIZEOF_LONG_LONG*53/22 up using
   integer arithmetic only. */
2490#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002491
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002492static int
2493unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2494 Py_ssize_t width, Py_ssize_t precision)
2495{
2496 Py_ssize_t length, fill, arglen;
2497 Py_UCS4 maxchar;
2498
2499 if (PyUnicode_READY(str) == -1)
2500 return -1;
2501
2502 length = PyUnicode_GET_LENGTH(str);
2503 if ((precision == -1 || precision >= length)
2504 && width <= length)
2505 return _PyUnicodeWriter_WriteStr(writer, str);
2506
2507 if (precision != -1)
2508 length = Py_MIN(precision, length);
2509
2510 arglen = Py_MAX(length, width);
2511 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2512 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2513 else
2514 maxchar = writer->maxchar;
2515
2516 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2517 return -1;
2518
2519 if (width > length) {
2520 fill = width - length;
2521 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2522 return -1;
2523 writer->pos += fill;
2524 }
2525
2526 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2527 str, 0, length);
2528 writer->pos += length;
2529 return 0;
2530}
2531
2532static int
Victor Stinner998b8062018-09-12 00:23:25 +02002533unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002534 Py_ssize_t width, Py_ssize_t precision)
2535{
2536 /* UTF-8 */
2537 Py_ssize_t length;
2538 PyObject *unicode;
2539 int res;
2540
2541 length = strlen(str);
2542 if (precision != -1)
2543 length = Py_MIN(length, precision);
2544 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2545 if (unicode == NULL)
2546 return -1;
2547
2548 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2549 Py_DECREF(unicode);
2550 return res;
2551}
2552
Victor Stinner96865452011-03-01 23:44:09 +00002553static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002554unicode_fromformat_arg(_PyUnicodeWriter *writer,
2555 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002556{
Victor Stinnere215d962012-10-06 23:03:36 +02002557 const char *p;
2558 Py_ssize_t len;
2559 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002560 Py_ssize_t width;
2561 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002562 int longflag;
2563 int longlongflag;
2564 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002565 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002566
2567 p = f;
2568 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002569 zeropad = 0;
2570 if (*f == '0') {
2571 zeropad = 1;
2572 f++;
2573 }
Victor Stinner96865452011-03-01 23:44:09 +00002574
2575 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002576 width = -1;
2577 if (Py_ISDIGIT((unsigned)*f)) {
2578 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002579 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002580 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002581 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002582 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002583 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002584 return NULL;
2585 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002586 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002587 f++;
2588 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002589 }
2590 precision = -1;
2591 if (*f == '.') {
2592 f++;
2593 if (Py_ISDIGIT((unsigned)*f)) {
2594 precision = (*f - '0');
2595 f++;
2596 while (Py_ISDIGIT((unsigned)*f)) {
2597 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2598 PyErr_SetString(PyExc_ValueError,
2599 "precision too big");
2600 return NULL;
2601 }
2602 precision = (precision * 10) + (*f - '0');
2603 f++;
2604 }
2605 }
Victor Stinner96865452011-03-01 23:44:09 +00002606 if (*f == '%') {
2607 /* "%.3%s" => f points to "3" */
2608 f--;
2609 }
2610 }
2611 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002612 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002613 f--;
2614 }
Victor Stinner96865452011-03-01 23:44:09 +00002615
2616 /* Handle %ld, %lu, %lld and %llu. */
2617 longflag = 0;
2618 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002619 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002620 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002621 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002622 longflag = 1;
2623 ++f;
2624 }
Victor Stinner96865452011-03-01 23:44:09 +00002625 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002626 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002627 longlongflag = 1;
2628 f += 2;
2629 }
Victor Stinner96865452011-03-01 23:44:09 +00002630 }
2631 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002632 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002633 size_tflag = 1;
2634 ++f;
2635 }
Victor Stinnere215d962012-10-06 23:03:36 +02002636
2637 if (f[1] == '\0')
2638 writer->overallocate = 0;
2639
2640 switch (*f) {
2641 case 'c':
2642 {
2643 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002644 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002645 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002646 "character argument not in range(0x110000)");
2647 return NULL;
2648 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002649 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002650 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002651 break;
2652 }
2653
2654 case 'i':
2655 case 'd':
2656 case 'u':
2657 case 'x':
2658 {
2659 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002660 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002661 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002662
2663 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002664 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002665 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002666 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002667 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002668 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002669 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002670 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002671 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002672 va_arg(*vargs, size_t));
2673 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002674 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002675 va_arg(*vargs, unsigned int));
2676 }
2677 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002678 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002679 }
2680 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002681 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002682 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002683 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002684 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002685 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002686 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002687 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002688 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_arg(*vargs, Py_ssize_t));
2690 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002691 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002692 va_arg(*vargs, int));
2693 }
2694 assert(len >= 0);
2695
Victor Stinnere215d962012-10-06 23:03:36 +02002696 if (precision < len)
2697 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002698
2699 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002700 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2701 return NULL;
2702
Victor Stinnere215d962012-10-06 23:03:36 +02002703 if (width > precision) {
2704 Py_UCS4 fillchar;
2705 fill = width - precision;
2706 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002707 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2708 return NULL;
2709 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002710 }
Victor Stinner15a11362012-10-06 23:48:20 +02002711 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002712 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002713 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2714 return NULL;
2715 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002716 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717
Victor Stinner4a587072013-11-19 12:54:53 +01002718 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2719 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002720 break;
2721 }
2722
2723 case 'p':
2724 {
2725 char number[MAX_LONG_LONG_CHARS];
2726
2727 len = sprintf(number, "%p", va_arg(*vargs, void*));
2728 assert(len >= 0);
2729
2730 /* %p is ill-defined: ensure leading 0x. */
2731 if (number[1] == 'X')
2732 number[1] = 'x';
2733 else if (number[1] != 'x') {
2734 memmove(number + 2, number,
2735 strlen(number) + 1);
2736 number[0] = '0';
2737 number[1] = 'x';
2738 len += 2;
2739 }
2740
Victor Stinner4a587072013-11-19 12:54:53 +01002741 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002742 return NULL;
2743 break;
2744 }
2745
2746 case 's':
2747 {
2748 /* UTF-8 */
2749 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002750 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002751 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002752 break;
2753 }
2754
2755 case 'U':
2756 {
2757 PyObject *obj = va_arg(*vargs, PyObject *);
2758 assert(obj && _PyUnicode_CHECK(obj));
2759
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002760 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002761 return NULL;
2762 break;
2763 }
2764
2765 case 'V':
2766 {
2767 PyObject *obj = va_arg(*vargs, PyObject *);
2768 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002769 if (obj) {
2770 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002771 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002772 return NULL;
2773 }
2774 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002775 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002776 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002777 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002778 }
2779 break;
2780 }
2781
2782 case 'S':
2783 {
2784 PyObject *obj = va_arg(*vargs, PyObject *);
2785 PyObject *str;
2786 assert(obj);
2787 str = PyObject_Str(obj);
2788 if (!str)
2789 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002791 Py_DECREF(str);
2792 return NULL;
2793 }
2794 Py_DECREF(str);
2795 break;
2796 }
2797
2798 case 'R':
2799 {
2800 PyObject *obj = va_arg(*vargs, PyObject *);
2801 PyObject *repr;
2802 assert(obj);
2803 repr = PyObject_Repr(obj);
2804 if (!repr)
2805 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002807 Py_DECREF(repr);
2808 return NULL;
2809 }
2810 Py_DECREF(repr);
2811 break;
2812 }
2813
2814 case 'A':
2815 {
2816 PyObject *obj = va_arg(*vargs, PyObject *);
2817 PyObject *ascii;
2818 assert(obj);
2819 ascii = PyObject_ASCII(obj);
2820 if (!ascii)
2821 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002822 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002823 Py_DECREF(ascii);
2824 return NULL;
2825 }
2826 Py_DECREF(ascii);
2827 break;
2828 }
2829
2830 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002831 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002832 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002833 break;
2834
2835 default:
2836 /* if we stumble upon an unknown formatting code, copy the rest
2837 of the format string to the output string. (we cannot just
2838 skip the code, since there's no way to know what's in the
2839 argument list) */
2840 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002841 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002842 return NULL;
2843 f = p+len;
2844 return f;
2845 }
2846
2847 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002848 return f;
2849}
2850
Walter Dörwaldd2034312007-05-18 16:29:38 +00002851PyObject *
2852PyUnicode_FromFormatV(const char *format, va_list vargs)
2853{
Victor Stinnere215d962012-10-06 23:03:36 +02002854 va_list vargs2;
2855 const char *f;
2856 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002857
Victor Stinner8f674cc2013-04-17 23:02:17 +02002858 _PyUnicodeWriter_Init(&writer);
2859 writer.min_length = strlen(format) + 100;
2860 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002861
Benjamin Peterson0c212142016-09-20 20:39:33 -07002862 // Copy varags to be able to pass a reference to a subfunction.
2863 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002864
2865 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002866 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002867 f = unicode_fromformat_arg(&writer, f, &vargs2);
2868 if (f == NULL)
2869 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002871 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002872 const char *p;
2873 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002874
Victor Stinnere215d962012-10-06 23:03:36 +02002875 p = f;
2876 do
2877 {
2878 if ((unsigned char)*p > 127) {
2879 PyErr_Format(PyExc_ValueError,
2880 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2881 "string, got a non-ASCII byte: 0x%02x",
2882 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002883 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002884 }
2885 p++;
2886 }
2887 while (*p != '\0' && *p != '%');
2888 len = p - f;
2889
2890 if (*p == '\0')
2891 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002892
2893 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002894 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002895
2896 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002897 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002898 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002899 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002900 return _PyUnicodeWriter_Finish(&writer);
2901
2902 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002903 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002904 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002905 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002906}
2907
Walter Dörwaldd2034312007-05-18 16:29:38 +00002908PyObject *
2909PyUnicode_FromFormat(const char *format, ...)
2910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002911 PyObject* ret;
2912 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002913
2914#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002915 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002916#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002917 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002919 ret = PyUnicode_FromFormatV(format, vargs);
2920 va_end(vargs);
2921 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002922}
2923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002924#ifdef HAVE_WCHAR_H
2925
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002926/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002927
Victor Stinnerd88d9832011-09-06 02:00:05 +02002928 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002929 character) required to convert the unicode object. Ignore size argument.
2930
Victor Stinnerd88d9832011-09-06 02:00:05 +02002931 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002932 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002933 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002934Py_ssize_t
2935PyUnicode_AsWideChar(PyObject *unicode,
2936 wchar_t *w,
2937 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002938{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002939 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 const wchar_t *wstr;
2941
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002942 if (unicode == NULL) {
2943 PyErr_BadInternalCall();
2944 return -1;
2945 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002946 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002947 if (wstr == NULL)
2948 return -1;
2949
Victor Stinner5593d8a2010-10-02 11:11:27 +00002950 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002951 if (size > res)
2952 size = res + 1;
2953 else
2954 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002955 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956 return res;
2957 }
2958 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002960}
2961
Victor Stinner137c34c2010-09-29 10:25:54 +00002962wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002963PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002964 Py_ssize_t *size)
2965{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002966 const wchar_t *wstr;
2967 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002968 Py_ssize_t buflen;
2969
2970 if (unicode == NULL) {
2971 PyErr_BadInternalCall();
2972 return NULL;
2973 }
2974
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002975 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
2976 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002977 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002978 }
2979 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
2980 PyErr_SetString(PyExc_ValueError,
2981 "embedded null character");
2982 return NULL;
2983 }
2984
2985 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00002986 if (buffer == NULL) {
2987 PyErr_NoMemory();
2988 return NULL;
2989 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002990 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002991 if (size != NULL)
2992 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002993 return buffer;
2994}
2995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002996#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997
Alexander Belopolsky40018472011-02-26 01:02:56 +00002998PyObject *
2999PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003000{
Victor Stinner8faf8212011-12-08 22:14:11 +01003001 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 PyErr_SetString(PyExc_ValueError,
3003 "chr() arg not in range(0x110000)");
3004 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003005 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003006
Victor Stinner985a82a2014-01-03 12:53:47 +01003007 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003008}
3009
Alexander Belopolsky40018472011-02-26 01:02:56 +00003010PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003011PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003013 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003014 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003015 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003016 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003017 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 Py_INCREF(obj);
3019 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003020 }
3021 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 /* For a Unicode subtype that's not a Unicode object,
3023 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003024 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003025 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003026 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003027 "Can't convert '%.100s' object to str implicitly",
3028 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003029 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003030}
3031
Alexander Belopolsky40018472011-02-26 01:02:56 +00003032PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003033PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003034 const char *encoding,
3035 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003036{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003037 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003038 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003039
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 PyErr_BadInternalCall();
3042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003044
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003045 /* Decoding bytes objects is the most common case and should be fast */
3046 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003047 if (PyBytes_GET_SIZE(obj) == 0)
3048 _Py_RETURN_UNICODE_EMPTY();
3049 v = PyUnicode_Decode(
3050 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3051 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003052 return v;
3053 }
3054
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 PyErr_SetString(PyExc_TypeError,
3057 "decoding str is not supported");
3058 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003059 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003060
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3062 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3063 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003064 "decoding to str: need a bytes-like object, %.80s found",
3065 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003066 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003067 }
Tim Petersced69f82003-09-16 20:30:58 +00003068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003070 PyBuffer_Release(&buffer);
3071 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003073
Serhiy Storchaka05997252013-01-26 12:14:02 +02003074 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003075 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003076 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077}
3078
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to lower in lowercase. Any run
   of other characters is replaced with a single '_', except at the very
   beginning (dropped) and at the very end (dropped, since the '_' is only
   emitted when another kept character follows).

   lower is a buffer of lower_len chars which receives the null-terminated
   result. Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last slot of the buffer: reserved for the null terminator. */
    char *const out_end = &lower[lower_len - 1];
    /* Set while a run of skipped characters is waiting to become '_'. */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3127
Alexander Belopolsky40018472011-02-26 01:02:56 +00003128PyObject *
3129PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003130 Py_ssize_t size,
3131 const char *encoding,
3132 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003133{
3134 PyObject *buffer = NULL, *unicode;
3135 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003136 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3137
3138 if (encoding == NULL) {
3139 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3140 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003141
Fred Drakee4315f52000-05-09 19:53:39 +00003142 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003143 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3144 char *lower = buflower;
3145
3146 /* Fast paths */
3147 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3148 lower += 3;
3149 if (*lower == '_') {
3150 /* Match "utf8" and "utf_8" */
3151 lower++;
3152 }
3153
3154 if (lower[0] == '8' && lower[1] == 0) {
3155 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3156 }
3157 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3158 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3159 }
3160 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3161 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3162 }
3163 }
3164 else {
3165 if (strcmp(lower, "ascii") == 0
3166 || strcmp(lower, "us_ascii") == 0) {
3167 return PyUnicode_DecodeASCII(s, size, errors);
3168 }
Steve Dowercc16be82016-09-08 10:35:16 -07003169 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003170 else if (strcmp(lower, "mbcs") == 0) {
3171 return PyUnicode_DecodeMBCS(s, size, errors);
3172 }
3173 #endif
3174 else if (strcmp(lower, "latin1") == 0
3175 || strcmp(lower, "latin_1") == 0
3176 || strcmp(lower, "iso_8859_1") == 0
3177 || strcmp(lower, "iso8859_1") == 0) {
3178 return PyUnicode_DecodeLatin1(s, size, errors);
3179 }
3180 }
Victor Stinner37296e82010-06-10 13:36:23 +00003181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
3183 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003184 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003185 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003186 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003187 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 if (buffer == NULL)
3189 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003190 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 if (unicode == NULL)
3192 goto onError;
3193 if (!PyUnicode_Check(unicode)) {
3194 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003195 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003196 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003197 encoding,
3198 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 Py_DECREF(unicode);
3200 goto onError;
3201 }
3202 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003203 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003204
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 Py_XDECREF(buffer);
3207 return NULL;
3208}
3209
Alexander Belopolsky40018472011-02-26 01:02:56 +00003210PyObject *
3211PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003212 const char *encoding,
3213 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003214{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003215 if (!PyUnicode_Check(unicode)) {
3216 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003217 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003218 }
3219
Serhiy Storchaka00939072016-10-27 21:05:49 +03003220 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3221 "PyUnicode_AsDecodedObject() is deprecated; "
3222 "use PyCodec_Decode() to decode from str", 1) < 0)
3223 return NULL;
3224
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003225 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003227
3228 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003229 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003230}
3231
Alexander Belopolsky40018472011-02-26 01:02:56 +00003232PyObject *
3233PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003234 const char *encoding,
3235 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003236{
3237 PyObject *v;
3238
3239 if (!PyUnicode_Check(unicode)) {
3240 PyErr_BadArgument();
3241 goto onError;
3242 }
3243
Serhiy Storchaka00939072016-10-27 21:05:49 +03003244 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3245 "PyUnicode_AsDecodedUnicode() is deprecated; "
3246 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3247 return NULL;
3248
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003249 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003251
3252 /* Decode via the codec registry */
3253 v = PyCodec_Decode(unicode, encoding, errors);
3254 if (v == NULL)
3255 goto onError;
3256 if (!PyUnicode_Check(v)) {
3257 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003258 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003259 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003260 encoding,
3261 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003262 Py_DECREF(v);
3263 goto onError;
3264 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003265 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003266
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003268 return NULL;
3269}
3270
Alexander Belopolsky40018472011-02-26 01:02:56 +00003271PyObject *
3272PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003273 Py_ssize_t size,
3274 const char *encoding,
3275 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
3277 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003278
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003279 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3283 Py_DECREF(unicode);
3284 return v;
3285}
3286
Alexander Belopolsky40018472011-02-26 01:02:56 +00003287PyObject *
3288PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003289 const char *encoding,
3290 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003291{
3292 PyObject *v;
3293
3294 if (!PyUnicode_Check(unicode)) {
3295 PyErr_BadArgument();
3296 goto onError;
3297 }
3298
Serhiy Storchaka00939072016-10-27 21:05:49 +03003299 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3300 "PyUnicode_AsEncodedObject() is deprecated; "
3301 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3302 "or PyCodec_Encode() for generic encoding", 1) < 0)
3303 return NULL;
3304
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003305 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003307
3308 /* Encode via the codec registry */
3309 v = PyCodec_Encode(unicode, encoding, errors);
3310 if (v == NULL)
3311 goto onError;
3312 return v;
3313
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003315 return NULL;
3316}
3317
Victor Stinner1b579672011-12-17 05:47:23 +01003318
Victor Stinner2cba6b82018-01-10 22:46:15 +01003319static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003320unicode_encode_locale(PyObject *unicode, const char *errors,
3321 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003322{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003323 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003324
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003325 Py_ssize_t wlen;
3326 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3327 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003328 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003329 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003330
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003331 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003332 if (wlen2 != wlen) {
3333 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003334 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003335 return NULL;
3336 }
3337
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003338 char *str;
3339 size_t error_pos;
3340 const char *reason;
3341 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003342 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003343 if (res != 0) {
3344 if (res == -2) {
3345 PyObject *exc;
3346 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3347 "locale", unicode,
3348 (Py_ssize_t)error_pos,
3349 (Py_ssize_t)(error_pos+1),
3350 reason);
3351 if (exc != NULL) {
3352 PyCodec_StrictErrors(exc);
3353 Py_DECREF(exc);
3354 }
3355 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003356 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003357 else if (res == -3) {
3358 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3359 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003360 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003361 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003362 PyMem_Free(wstr);
3363 return NULL;
3364 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003365 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003366 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003367
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003368 PyObject *bytes = PyBytes_FromString(str);
3369 PyMem_RawFree(str);
3370 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003371}
3372
Victor Stinnerad158722010-10-27 00:25:46 +00003373PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003374PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3375{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003376 return unicode_encode_locale(unicode, errors, 1);
3377}
3378
3379PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003380PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003381{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003382 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003383 const _PyCoreConfig *config = &interp->core_config;
3384#if defined(__APPLE__)
3385 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3386#else
Victor Stinner793b5312011-04-27 00:24:21 +02003387 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3388 cannot use it to encode and decode filenames before it is loaded. Load
3389 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003390 implementation of the locale codec until the codec registry is
3391 initialized and the Python codec is loaded. See initfsencoding(). */
3392 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003393 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003394 config->filesystem_encoding,
3395 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003396 }
3397 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003398 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003399 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003400 }
Victor Stinnerad158722010-10-27 00:25:46 +00003401#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003402}
3403
Alexander Belopolsky40018472011-02-26 01:02:56 +00003404PyObject *
3405PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003406 const char *encoding,
3407 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408{
3409 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003410 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003411
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 if (!PyUnicode_Check(unicode)) {
3413 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 }
Fred Drakee4315f52000-05-09 19:53:39 +00003416
Victor Stinner942889a2016-09-05 15:40:10 -07003417 if (encoding == NULL) {
3418 return _PyUnicode_AsUTF8String(unicode, errors);
3419 }
3420
Fred Drakee4315f52000-05-09 19:53:39 +00003421 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003422 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3423 char *lower = buflower;
3424
3425 /* Fast paths */
3426 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3427 lower += 3;
3428 if (*lower == '_') {
3429 /* Match "utf8" and "utf_8" */
3430 lower++;
3431 }
3432
3433 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003434 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003435 }
3436 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3437 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3438 }
3439 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3440 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3441 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003442 }
Victor Stinner942889a2016-09-05 15:40:10 -07003443 else {
3444 if (strcmp(lower, "ascii") == 0
3445 || strcmp(lower, "us_ascii") == 0) {
3446 return _PyUnicode_AsASCIIString(unicode, errors);
3447 }
Steve Dowercc16be82016-09-08 10:35:16 -07003448#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003449 else if (strcmp(lower, "mbcs") == 0) {
3450 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3451 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003452#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003453 else if (strcmp(lower, "latin1") == 0 ||
3454 strcmp(lower, "latin_1") == 0 ||
3455 strcmp(lower, "iso_8859_1") == 0 ||
3456 strcmp(lower, "iso8859_1") == 0) {
3457 return _PyUnicode_AsLatin1String(unicode, errors);
3458 }
3459 }
Victor Stinner37296e82010-06-10 13:36:23 +00003460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461
3462 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003463 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003465 return NULL;
3466
3467 /* The normal path */
3468 if (PyBytes_Check(v))
3469 return v;
3470
3471 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003473 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003474 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003475
3476 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003477 "encoder %s returned bytearray instead of bytes; "
3478 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003479 encoding);
3480 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003481 Py_DECREF(v);
3482 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003483 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003484
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003485 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3486 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003487 Py_DECREF(v);
3488 return b;
3489 }
3490
3491 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003492 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003493 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003494 encoding,
3495 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003496 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003497 return NULL;
3498}
3499
Alexander Belopolsky40018472011-02-26 01:02:56 +00003500PyObject *
3501PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003502 const char *encoding,
3503 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504{
3505 PyObject *v;
3506
3507 if (!PyUnicode_Check(unicode)) {
3508 PyErr_BadArgument();
3509 goto onError;
3510 }
3511
Serhiy Storchaka00939072016-10-27 21:05:49 +03003512 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3513 "PyUnicode_AsEncodedUnicode() is deprecated; "
3514 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3515 return NULL;
3516
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003517 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003519
3520 /* Encode via the codec registry */
3521 v = PyCodec_Encode(unicode, encoding, errors);
3522 if (v == NULL)
3523 goto onError;
3524 if (!PyUnicode_Check(v)) {
3525 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003526 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003527 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003528 encoding,
3529 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003530 Py_DECREF(v);
3531 goto onError;
3532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003534
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 return NULL;
3537}
3538
Victor Stinner2cba6b82018-01-10 22:46:15 +01003539static PyObject*
3540unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3541 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003542{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003543 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003544
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003545 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3546 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547 return NULL;
3548 }
3549
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003550 wchar_t *wstr;
3551 size_t wlen;
3552 const char *reason;
3553 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003554 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003555 if (res != 0) {
3556 if (res == -2) {
3557 PyObject *exc;
3558 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3559 "locale", str, len,
3560 (Py_ssize_t)wlen,
3561 (Py_ssize_t)(wlen + 1),
3562 reason);
3563 if (exc != NULL) {
3564 PyCodec_StrictErrors(exc);
3565 Py_DECREF(exc);
3566 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003567 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003568 else if (res == -3) {
3569 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3570 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003571 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003572 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003573 }
Victor Stinner2f197072011-12-17 07:08:30 +01003574 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003575 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003576
3577 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3578 PyMem_RawFree(wstr);
3579 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003580}
3581
3582PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003583PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3584 const char *errors)
3585{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003586 return unicode_decode_locale(str, len, errors, 1);
3587}
3588
3589PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003590PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003591{
3592 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003593 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003594}
3595
3596
3597PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003598PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003599 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003600 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3601}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003602
Christian Heimes5894ba72007-11-04 11:43:14 +00003603PyObject*
3604PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3605{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003606 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003607 const _PyCoreConfig *config = &interp->core_config;
3608#if defined(__APPLE__)
3609 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3610#else
Victor Stinner793b5312011-04-27 00:24:21 +02003611 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3612 cannot use it to encode and decode filenames before it is loaded. Load
3613 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003614 implementation of the locale codec until the codec registry is
3615 initialized and the Python codec is loaded. See initfsencoding(). */
3616 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003617 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003618 config->filesystem_encoding,
3619 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003620 }
3621 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003622 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003623 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003624 }
Victor Stinnerad158722010-10-27 00:25:46 +00003625#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003626}
3627
Martin v. Löwis011e8422009-05-05 04:43:17 +00003628
3629int
3630PyUnicode_FSConverter(PyObject* arg, void* addr)
3631{
Brett Cannonec6ce872016-09-06 15:50:29 -07003632 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003633 PyObject *output = NULL;
3634 Py_ssize_t size;
3635 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003636 if (arg == NULL) {
3637 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003638 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003639 return 1;
3640 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003641 path = PyOS_FSPath(arg);
3642 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003643 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003644 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003645 if (PyBytes_Check(path)) {
3646 output = path;
3647 }
3648 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3649 output = PyUnicode_EncodeFSDefault(path);
3650 Py_DECREF(path);
3651 if (!output) {
3652 return 0;
3653 }
3654 assert(PyBytes_Check(output));
3655 }
3656
Victor Stinner0ea2a462010-04-30 00:22:08 +00003657 size = PyBytes_GET_SIZE(output);
3658 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003659 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003660 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003661 Py_DECREF(output);
3662 return 0;
3663 }
3664 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003665 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003666}
3667
3668
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003669int
3670PyUnicode_FSDecoder(PyObject* arg, void* addr)
3671{
Brett Cannona5711202016-09-06 19:36:01 -07003672 int is_buffer = 0;
3673 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003674 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003675 if (arg == NULL) {
3676 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003677 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003678 return 1;
3679 }
Brett Cannona5711202016-09-06 19:36:01 -07003680
3681 is_buffer = PyObject_CheckBuffer(arg);
3682 if (!is_buffer) {
3683 path = PyOS_FSPath(arg);
3684 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003685 return 0;
3686 }
Brett Cannona5711202016-09-06 19:36:01 -07003687 }
3688 else {
3689 path = arg;
3690 Py_INCREF(arg);
3691 }
3692
3693 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003694 output = path;
3695 }
3696 else if (PyBytes_Check(path) || is_buffer) {
3697 PyObject *path_bytes = NULL;
3698
3699 if (!PyBytes_Check(path) &&
3700 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003701 "path should be string, bytes, or os.PathLike, not %.200s",
3702 Py_TYPE(arg)->tp_name)) {
3703 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003704 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003705 }
3706 path_bytes = PyBytes_FromObject(path);
3707 Py_DECREF(path);
3708 if (!path_bytes) {
3709 return 0;
3710 }
3711 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3712 PyBytes_GET_SIZE(path_bytes));
3713 Py_DECREF(path_bytes);
3714 if (!output) {
3715 return 0;
3716 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003717 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003718 else {
3719 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003720 "path should be string, bytes, or os.PathLike, not %.200s",
3721 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003722 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003723 return 0;
3724 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003725 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003726 Py_DECREF(output);
3727 return 0;
3728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003730 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003731 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003732 Py_DECREF(output);
3733 return 0;
3734 }
3735 *(PyObject**)addr = output;
3736 return Py_CLEANUP_SUPPORTED;
3737}
3738
3739
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003740const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003742{
Christian Heimesf3863112007-11-22 07:46:41 +00003743 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003745 if (!PyUnicode_Check(unicode)) {
3746 PyErr_BadArgument();
3747 return NULL;
3748 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003749 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003750 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003752 if (PyUnicode_UTF8(unicode) == NULL) {
3753 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003754 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 if (bytes == NULL)
3756 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3758 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003759 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 Py_DECREF(bytes);
3761 return NULL;
3762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003763 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003764 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 PyBytes_AS_STRING(bytes),
3766 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 Py_DECREF(bytes);
3768 }
3769
3770 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003771 *psize = PyUnicode_UTF8_LENGTH(unicode);
3772 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003773}
3774
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003775const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3779}
3780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781Py_UNICODE *
3782PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 const unsigned char *one_byte;
3785#if SIZEOF_WCHAR_T == 4
3786 const Py_UCS2 *two_bytes;
3787#else
3788 const Py_UCS4 *four_bytes;
3789 const Py_UCS4 *ucs4_end;
3790 Py_ssize_t num_surrogates;
3791#endif
3792 wchar_t *w;
3793 wchar_t *wchar_end;
3794
3795 if (!PyUnicode_Check(unicode)) {
3796 PyErr_BadArgument();
3797 return NULL;
3798 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003801 assert(_PyUnicode_KIND(unicode) != 0);
3802 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003806 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3807 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003808 num_surrogates = 0;
3809
3810 for (; four_bytes < ucs4_end; ++four_bytes) {
3811 if (*four_bytes > 0xFFFF)
3812 ++num_surrogates;
3813 }
3814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3816 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3817 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 PyErr_NoMemory();
3819 return NULL;
3820 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003821 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003823 w = _PyUnicode_WSTR(unicode);
3824 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3825 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3827 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003828 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003830 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3831 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832 }
3833 else
3834 *w = *four_bytes;
3835
3836 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07003837 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 }
3839 }
3840 *w = 0;
3841#else
3842 /* sizeof(wchar_t) == 4 */
3843 Py_FatalError("Impossible unicode object state, wstr and str "
3844 "should share memory already.");
3845 return NULL;
3846#endif
3847 }
3848 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003849 if ((size_t)_PyUnicode_LENGTH(unicode) >
3850 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3851 PyErr_NoMemory();
3852 return NULL;
3853 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003854 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3855 (_PyUnicode_LENGTH(unicode) + 1));
3856 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 PyErr_NoMemory();
3858 return NULL;
3859 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3861 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3862 w = _PyUnicode_WSTR(unicode);
3863 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003865 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3866 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 for (; w < wchar_end; ++one_byte, ++w)
3868 *w = *one_byte;
3869 /* null-terminate the wstr */
3870 *w = 0;
3871 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003874 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 for (; w < wchar_end; ++two_bytes, ++w)
3876 *w = *two_bytes;
3877 /* null-terminate the wstr */
3878 *w = 0;
3879#else
3880 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 PyObject_FREE(_PyUnicode_WSTR(unicode));
3882 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 Py_FatalError("Impossible unicode object state, wstr "
3884 "and str should share memory already.");
3885 return NULL;
3886#endif
3887 }
3888 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07003889 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 }
3891 }
3892 }
3893 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 *size = PyUnicode_WSTR_LENGTH(unicode);
3895 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003896}
3897
Alexander Belopolsky40018472011-02-26 01:02:56 +00003898Py_UNICODE *
3899PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902}
3903
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003904const Py_UNICODE *
3905_PyUnicode_AsUnicode(PyObject *unicode)
3906{
3907 Py_ssize_t size;
3908 const Py_UNICODE *wstr;
3909
3910 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3911 if (wstr && wcslen(wstr) != (size_t)size) {
3912 PyErr_SetString(PyExc_ValueError, "embedded null character");
3913 return NULL;
3914 }
3915 return wstr;
3916}
3917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918
Alexander Belopolsky40018472011-02-26 01:02:56 +00003919Py_ssize_t
3920PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921{
3922 if (!PyUnicode_Check(unicode)) {
3923 PyErr_BadArgument();
3924 goto onError;
3925 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003926 if (_PyUnicode_WSTR(unicode) == NULL) {
3927 if (PyUnicode_AsUnicode(unicode) == NULL)
3928 goto onError;
3929 }
3930 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931
Benjamin Peterson29060642009-01-31 22:14:21 +00003932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 return -1;
3934}
3935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936Py_ssize_t
3937PyUnicode_GetLength(PyObject *unicode)
3938{
Victor Stinner07621332012-06-16 04:53:46 +02003939 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940 PyErr_BadArgument();
3941 return -1;
3942 }
Victor Stinner07621332012-06-16 04:53:46 +02003943 if (PyUnicode_READY(unicode) == -1)
3944 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 return PyUnicode_GET_LENGTH(unicode);
3946}
3947
3948Py_UCS4
3949PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3950{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003951 void *data;
3952 int kind;
3953
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003954 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003955 PyErr_BadArgument();
3956 return (Py_UCS4)-1;
3957 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003958 if (PyUnicode_READY(unicode) == -1) {
3959 return (Py_UCS4)-1;
3960 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003961 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003962 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 return (Py_UCS4)-1;
3964 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003965 data = PyUnicode_DATA(unicode);
3966 kind = PyUnicode_KIND(unicode);
3967 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968}
3969
3970int
3971PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3972{
3973 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003974 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 return -1;
3976 }
Victor Stinner488fa492011-12-12 00:01:39 +01003977 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003978 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003979 PyErr_SetString(PyExc_IndexError, "string index out of range");
3980 return -1;
3981 }
Victor Stinner488fa492011-12-12 00:01:39 +01003982 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003983 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003984 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3985 PyErr_SetString(PyExc_ValueError, "character out of range");
3986 return -1;
3987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3989 index, ch);
3990 return 0;
3991}
3992
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3998
Victor Stinner554f3f02010-06-16 23:33:54 +00003999/* create or adjust a UnicodeDecodeError */
4000static void
4001make_decode_exception(PyObject **exceptionObject,
4002 const char *encoding,
4003 const char *input, Py_ssize_t length,
4004 Py_ssize_t startpos, Py_ssize_t endpos,
4005 const char *reason)
4006{
4007 if (*exceptionObject == NULL) {
4008 *exceptionObject = PyUnicodeDecodeError_Create(
4009 encoding, input, length, startpos, endpos, reason);
4010 }
4011 else {
4012 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4013 goto onError;
4014 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4015 goto onError;
4016 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4017 goto onError;
4018 }
4019 return;
4020
4021onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004022 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004023}
4024
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders that write into a
   wchar_t-kind output string:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler: name of the error handler and a cache slot for the
       callable; the callable is looked up with PyCodec_LookupError() when
       the slot is still NULL.
   encoding, reason: used to create or update the UnicodeDecodeError.
   input, inend: start and end of the input bytes; both are re-read from the
       exception object after the callback, which might have replaced it.
   startinpos, endinpos: position of the undecodable range; *endinpos is set
       to the position returned by the callback.
   exceptionObject: cache slot for the UnicodeDecodeError instance.
   inptr: set to *input plus the position returned by the callback.
   output, outpos: output string and write offset; the string is resized if
       needed and *outpos is advanced past the replacement.

   return 0 on success, -1 on error (with an exception set)
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode is owned by restuple; no separate reference is taken. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is both safe and sufficient. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  A length-based copy is required
       here: wcsncpy() stops at the first null wide character and zero-fills
       the remainder, which would corrupt a replacement string containing an
       embedded U+0000. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004137
4138static int
4139unicode_decode_call_errorhandler_writer(
4140 const char *errors, PyObject **errorHandler,
4141 const char *encoding, const char *reason,
4142 const char **input, const char **inend, Py_ssize_t *startinpos,
4143 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4144 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4145{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004146 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004147
4148 PyObject *restuple = NULL;
4149 PyObject *repunicode = NULL;
4150 Py_ssize_t insize;
4151 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004152 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004153 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004155 int need_to_grow = 0;
4156 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004157
4158 if (*errorHandler == NULL) {
4159 *errorHandler = PyCodec_LookupError(errors);
4160 if (*errorHandler == NULL)
4161 goto onError;
4162 }
4163
4164 make_decode_exception(exceptionObject,
4165 encoding,
4166 *input, *inend - *input,
4167 *startinpos, *endinpos,
4168 reason);
4169 if (*exceptionObject == NULL)
4170 goto onError;
4171
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004172 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004173 if (restuple == NULL)
4174 goto onError;
4175 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004176 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 goto onError;
4178 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004179 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004180 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004181
4182 /* Copy back the bytes variables, which might have been modified by the
4183 callback */
4184 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4185 if (!inputobj)
4186 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004187 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004188 *input = PyBytes_AS_STRING(inputobj);
4189 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004191 /* we can DECREF safely, as the exception has another reference,
4192 so the object won't go away. */
4193 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004194
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004197 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004198 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004200 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201
Victor Stinner170ca6f2013-04-18 00:25:28 +02004202 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004203 if (replen > 1) {
4204 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004205 need_to_grow = 1;
4206 }
4207 new_inptr = *input + newpos;
4208 if (*inend - new_inptr > remain) {
4209 /* We don't know the decoding algorithm here so we make the worst
4210 assumption that one byte decodes to one unicode character.
4211 If unfortunately one byte could decode to more unicode characters,
4212 the decoder may write out-of-bound then. Is it possible for the
4213 algorithms using this function? */
4214 writer->min_length += *inend - new_inptr - remain;
4215 need_to_grow = 1;
4216 }
4217 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004218 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004219 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004220 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4221 goto onError;
4222 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004223 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004224 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004227 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004230 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004231 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236}
4237
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004238/* --- UTF-7 Codec -------------------------------------------------------- */
4239
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  All of the macros below may
   evaluate their argument more than once, so pass side-effect-free
   expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that an
 * argument containing a lower-precedence operator (e.g. `a || b`) is
 * still evaluated as a single flag. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318
Alexander Belopolsky40018472011-02-26 01:02:56 +00004319PyObject *
4320PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004321 Py_ssize_t size,
4322 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004324 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4325}
4326
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327/* The decoder. The only state we preserve is our read position,
4328 * i.e. how many characters we have consumed. So if we end in the
4329 * middle of a shift sequence we have to back off the read position
4330 * and the output to the beginning of the sequence, otherwise we lose
4331 * all the shift state (seen bits, number of bits seen, high
4332 * surrogate). */
4333
Alexander Belopolsky40018472011-02-26 01:02:56 +00004334PyObject *
4335PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004336 Py_ssize_t size,
4337 const char *errors,
4338 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004339{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004341 Py_ssize_t startinpos;
4342 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 const char *errmsg = "";
4346 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004347 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 unsigned int base64bits = 0;
4349 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004350 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351 PyObject *errorHandler = NULL;
4352 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004354 if (size == 0) {
4355 if (consumed)
4356 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004357 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004358 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004359
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004361 _PyUnicodeWriter_Init(&writer);
4362 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363
4364 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365 e = s + size;
4366
4367 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004370 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004371
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 if (inShift) { /* in a base-64 section */
4373 if (IS_BASE64(ch)) { /* consume a base-64 character */
4374 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4375 base64bits += 6;
4376 s++;
4377 if (base64bits >= 16) {
4378 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004379 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 base64bits -= 16;
4381 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004382 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 if (surrogate) {
4384 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004385 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4386 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004387 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004388 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004390 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 }
4392 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004393 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004394 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 }
4397 }
Victor Stinner551ac952011-11-29 22:58:13 +01004398 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 /* first surrogate */
4400 surrogate = outCh;
4401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004403 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004404 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 }
4406 }
4407 }
4408 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 if (base64bits > 0) { /* left-over bits */
4411 if (base64bits >= 6) {
4412 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004413 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414 errmsg = "partial character in shift sequence";
4415 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 else {
4418 /* Some bits remain; they should be zero */
4419 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004420 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 errmsg = "non-zero padding bits in shift sequence";
4422 goto utf7Error;
4423 }
4424 }
4425 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004426 if (surrogate && DECODE_DIRECT(ch)) {
4427 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4428 goto onError;
4429 }
4430 surrogate = 0;
4431 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004432 /* '-' is absorbed; other terminating
4433 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004434 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004436 }
4437 }
4438 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 s++; /* consume '+' */
4441 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004443 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004444 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004446 else if (s < e && !IS_BASE64(*s)) {
4447 s++;
4448 errmsg = "ill-formed sequence";
4449 goto utf7Error;
4450 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004453 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004454 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004456 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457 }
4458 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004461 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004463 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 else {
4465 startinpos = s-starts;
4466 s++;
4467 errmsg = "unexpected special character";
4468 goto utf7Error;
4469 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004473 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 errors, &errorHandler,
4475 "utf7", errmsg,
4476 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479 }
4480
Antoine Pitrou244651a2009-05-04 18:56:13 +00004481 /* end of string */
4482
4483 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4484 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004485 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004486 if (surrogate ||
4487 (base64bits >= 6) ||
4488 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004490 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 errors, &errorHandler,
4492 "utf7", "unterminated shift sequence",
4493 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004494 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 goto onError;
4496 if (s < e)
4497 goto restart;
4498 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500
4501 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004502 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004503 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004504 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004505 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004506 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004507 writer.kind, writer.data, shiftOutStart);
4508 Py_XDECREF(errorHandler);
4509 Py_XDECREF(exc);
4510 _PyUnicodeWriter_Dealloc(&writer);
4511 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004512 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004513 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 }
4515 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004516 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004518 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 Py_XDECREF(errorHandler);
4521 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004522 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 Py_XDECREF(errorHandler);
4526 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004527 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 return NULL;
4529}
4530
4531
Alexander Belopolsky40018472011-02-26 01:02:56 +00004532PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004533_PyUnicode_EncodeUTF7(PyObject *str,
4534 int base64SetO,
4535 int base64WhiteSpace,
4536 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004538 int kind;
4539 void *data;
4540 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004541 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004543 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 unsigned int base64bits = 0;
4545 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 char * out;
4547 char * start;
4548
Benjamin Petersonbac79492012-01-14 13:34:47 -05004549 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004550 return NULL;
4551 kind = PyUnicode_KIND(str);
4552 data = PyUnicode_DATA(str);
4553 len = PyUnicode_GET_LENGTH(str);
4554
4555 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004558 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004559 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004560 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004561 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562 if (v == NULL)
4563 return NULL;
4564
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004565 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004566 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004567 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004568
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 if (inShift) {
4570 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4571 /* shifting out */
4572 if (base64bits) { /* output remaining bits */
4573 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4574 base64buffer = 0;
4575 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 }
4577 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 /* Characters not in the BASE64 set implicitly unshift the sequence
4579 so no '-' is required, except if the character is itself a '-' */
4580 if (IS_BASE64(ch) || ch == '-') {
4581 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 *out++ = (char) ch;
4584 }
4585 else {
4586 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004587 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 else { /* not in a shift sequence */
4590 if (ch == '+') {
4591 *out++ = '+';
4592 *out++ = '-';
4593 }
4594 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4595 *out++ = (char) ch;
4596 }
4597 else {
4598 *out++ = '+';
4599 inShift = 1;
4600 goto encode_char;
4601 }
4602 }
4603 continue;
4604encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004606 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004607
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 /* code first surrogate */
4609 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004610 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 while (base64bits >= 6) {
4612 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4613 base64bits -= 6;
4614 }
4615 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004616 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 base64bits += 16;
4619 base64buffer = (base64buffer << 16) | ch;
4620 while (base64bits >= 6) {
4621 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4622 base64bits -= 6;
4623 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 if (base64bits)
4626 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4627 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004629 if (_PyBytes_Resize(&v, out - start) < 0)
4630 return NULL;
4631 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004633PyObject *
4634PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4635 Py_ssize_t size,
4636 int base64SetO,
4637 int base64WhiteSpace,
4638 const char *errors)
4639{
4640 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004641 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004642 if (tmp == NULL)
4643 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004644 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004645 base64WhiteSpace, errors);
4646 Py_DECREF(tmp);
4647 return result;
4648}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650#undef IS_BASE64
4651#undef FROM_BASE64
4652#undef TO_BASE64
4653#undef DECODE_DIRECT
4654#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656/* --- UTF-8 Codec -------------------------------------------------------- */
4657
Alexander Belopolsky40018472011-02-26 01:02:56 +00004658PyObject *
4659PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004660 Py_ssize_t size,
4661 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662{
Walter Dörwald69652032004-09-07 20:24:22 +00004663 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4664}
4665
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004666#include "stringlib/asciilib.h"
4667#include "stringlib/codecs.h"
4668#include "stringlib/undef.h"
4669
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004670#include "stringlib/ucs1lib.h"
4671#include "stringlib/codecs.h"
4672#include "stringlib/undef.h"
4673
4674#include "stringlib/ucs2lib.h"
4675#include "stringlib/codecs.h"
4676#include "stringlib/undef.h"
4677
4678#include "stringlib/ucs4lib.h"
4679#include "stringlib/codecs.h"
4680#include "stringlib/undef.h"
4681
Antoine Pitrouab868312009-01-10 15:40:25 +00004682/* Mask to quickly check whether a C 'long' contains a
4683 non-ASCII, UTF8-encoded char. */
4684#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004685# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004686#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004687# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004688#else
4689# error C 'long' size should be either 4 or 8!
4690#endif
4691
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004692static Py_ssize_t
4693ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004694{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004696 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004697
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004698 /*
4699 * Issue #17237: m68k is a bit different from most architectures in
4700 * that objects do not use "natural alignment" - for example, int and
4701 * long are only aligned at 2-byte boundaries. Therefore the assert()
4702 * won't work; also, tests have shown that skipping the "optimised
4703 * version" will even speed up m68k.
4704 */
4705#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004706#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004707 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4708 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004709 /* Fast path, see in STRINGLIB(utf8_decode) for
4710 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004711 /* Help allocation */
4712 const char *_p = p;
4713 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 while (_p < aligned_end) {
4715 unsigned long value = *(const unsigned long *) _p;
4716 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004717 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 *((unsigned long *)q) = value;
4719 _p += SIZEOF_LONG;
4720 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004721 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 p = _p;
4723 while (p < end) {
4724 if ((unsigned char)*p & 0x80)
4725 break;
4726 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004731#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 while (p < end) {
4733 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4734 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004735 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004736 /* Help allocation */
4737 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004738 while (_p < aligned_end) {
4739 unsigned long value = *(unsigned long *) _p;
4740 if (value & ASCII_CHAR_MASK)
4741 break;
4742 _p += SIZEOF_LONG;
4743 }
4744 p = _p;
4745 if (_p == end)
4746 break;
4747 }
4748 if ((unsigned char)*p & 0x80)
4749 break;
4750 ++p;
4751 }
4752 memcpy(dest, start, p - start);
4753 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754}
Antoine Pitrouab868312009-01-10 15:40:25 +00004755
Victor Stinner785938e2011-12-11 20:09:03 +01004756PyObject *
4757PyUnicode_DecodeUTF8Stateful(const char *s,
4758 Py_ssize_t size,
4759 const char *errors,
4760 Py_ssize_t *consumed)
4761{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004762 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004763 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765
4766 Py_ssize_t startinpos;
4767 Py_ssize_t endinpos;
4768 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004769 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004771 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004772
4773 if (size == 0) {
4774 if (consumed)
4775 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004776 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004777 }
4778
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4780 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004781 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 *consumed = 1;
4783 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004784 }
4785
Victor Stinner8f674cc2013-04-17 23:02:17 +02004786 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004787 writer.min_length = size;
4788 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004789 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004790
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004791 writer.pos = ascii_decode(s, end, writer.data);
4792 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004793 while (s < end) {
4794 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004795 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004796
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004798 if (PyUnicode_IS_ASCII(writer.buffer))
4799 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004801 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 } else {
4805 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004806 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 }
4808
4809 switch (ch) {
4810 case 0:
4811 if (s == end || consumed)
4812 goto End;
4813 errmsg = "unexpected end of data";
4814 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004815 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004816 break;
4817 case 1:
4818 errmsg = "invalid start byte";
4819 startinpos = s - starts;
4820 endinpos = startinpos + 1;
4821 break;
4822 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004823 case 3:
4824 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825 errmsg = "invalid continuation byte";
4826 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004827 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 break;
4829 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004830 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004831 goto onError;
4832 continue;
4833 }
4834
Victor Stinner1d65d912015-10-05 13:43:50 +02004835 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004836 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004837
4838 switch (error_handler) {
4839 case _Py_ERROR_IGNORE:
4840 s += (endinpos - startinpos);
4841 break;
4842
4843 case _Py_ERROR_REPLACE:
4844 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4845 goto onError;
4846 s += (endinpos - startinpos);
4847 break;
4848
4849 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004850 {
4851 Py_ssize_t i;
4852
Victor Stinner1d65d912015-10-05 13:43:50 +02004853 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4854 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004855 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004856 ch = (Py_UCS4)(unsigned char)(starts[i]);
4857 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4858 ch + 0xdc00);
4859 writer.pos++;
4860 }
4861 s += (endinpos - startinpos);
4862 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004863 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004864
4865 default:
4866 if (unicode_decode_call_errorhandler_writer(
4867 errors, &error_handler_obj,
4868 "utf-8", errmsg,
4869 &starts, &end, &startinpos, &endinpos, &exc, &s,
4870 &writer))
4871 goto onError;
4872 }
Victor Stinner785938e2011-12-11 20:09:03 +01004873 }
4874
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004875End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004876 if (consumed)
4877 *consumed = s - starts;
4878
Victor Stinner1d65d912015-10-05 13:43:50 +02004879 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004881 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882
4883onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004884 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004888}
4889
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004890
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004891/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4892 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004893
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004894 On success, write a pointer to a newly allocated wide character string into
4895 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4896 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004897
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004898 On memory allocation failure, return -1.
4899
4900 On decoding error (if surrogateescape is zero), return -2. If wlen is
4901 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4902 is not NULL, write the decoding error message into *reason. */
4903int
4904_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004905 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004906{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004907 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004908 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004909 wchar_t *unicode;
4910 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004911
Victor Stinner3d4226a2018-08-29 22:21:32 +02004912 int surrogateescape = 0;
4913 int surrogatepass = 0;
4914 switch (errors)
4915 {
4916 case _Py_ERROR_STRICT:
4917 break;
4918 case _Py_ERROR_SURROGATEESCAPE:
4919 surrogateescape = 1;
4920 break;
4921 case _Py_ERROR_SURROGATEPASS:
4922 surrogatepass = 1;
4923 break;
4924 default:
4925 return -3;
4926 }
4927
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004928 /* Note: size will always be longer than the resulting Unicode
4929 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004930 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004931 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004932 }
4933
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004934 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004935 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004936 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004937 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004938
4939 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004940 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004942 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004944#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004946#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004948#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 if (ch > 0xFF) {
4950#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004951 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004953 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004954 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4956 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4957#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004958 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02004960 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004962 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004963
4964 if (surrogateescape) {
4965 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4966 }
4967 else {
4968 /* Is it a valid three-byte code? */
4969 if (surrogatepass
4970 && (e - s) >= 3
4971 && (s[0] & 0xf0) == 0xe0
4972 && (s[1] & 0xc0) == 0x80
4973 && (s[2] & 0xc0) == 0x80)
4974 {
4975 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4976 s += 3;
4977 unicode[outpos++] = ch;
4978 }
4979 else {
4980 PyMem_RawFree(unicode );
4981 if (reason != NULL) {
4982 switch (ch) {
4983 case 0:
4984 *reason = "unexpected end of data";
4985 break;
4986 case 1:
4987 *reason = "invalid start byte";
4988 break;
4989 /* 2, 3, 4 */
4990 default:
4991 *reason = "invalid continuation byte";
4992 break;
4993 }
4994 }
4995 if (wlen != NULL) {
4996 *wlen = s - orig_s;
4997 }
4998 return -2;
4999 }
5000 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005004 if (wlen) {
5005 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005006 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005007 *wstr = unicode;
5008 return 0;
5009}
5010
5011wchar_t*
5012_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5013{
5014 wchar_t *wstr;
5015 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5016 if (res != 0) {
5017 return NULL;
5018 }
5019 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005020}
5021
Antoine Pitrouab868312009-01-10 15:40:25 +00005022
Victor Stinnere47e6982017-12-21 15:45:16 +01005023/* UTF-8 encoder using the surrogateescape error handler .
5024
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005025 On success, return 0 and write the newly allocated character string (use
5026 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005027
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005028 On encoding failure, return -2 and write the position of the invalid
5029 surrogate character into *error_pos (if error_pos is set) and the decoding
5030 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005031
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005032 On memory allocation failure, return -1. */
5033int
5034_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005035 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005036{
5037 const Py_ssize_t max_char_size = 4;
5038 Py_ssize_t len = wcslen(text);
5039
5040 assert(len >= 0);
5041
Victor Stinner3d4226a2018-08-29 22:21:32 +02005042 int surrogateescape = 0;
5043 int surrogatepass = 0;
5044 switch (errors)
5045 {
5046 case _Py_ERROR_STRICT:
5047 break;
5048 case _Py_ERROR_SURROGATEESCAPE:
5049 surrogateescape = 1;
5050 break;
5051 case _Py_ERROR_SURROGATEPASS:
5052 surrogatepass = 1;
5053 break;
5054 default:
5055 return -3;
5056 }
5057
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005058 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5059 return -1;
5060 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005061 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005062 if (raw_malloc) {
5063 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005064 }
5065 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005066 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005067 }
5068 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005069 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005070 }
5071
5072 char *p = bytes;
5073 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005074 for (i = 0; i < len; ) {
5075 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005077 i++;
5078#if Py_UNICODE_SIZE == 2
5079 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5080 && i < len
5081 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5082 {
5083 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5084 i++;
5085 }
5086#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005087
5088 if (ch < 0x80) {
5089 /* Encode ASCII */
5090 *p++ = (char) ch;
5091
5092 }
5093 else if (ch < 0x0800) {
5094 /* Encode Latin-1 */
5095 *p++ = (char)(0xc0 | (ch >> 6));
5096 *p++ = (char)(0x80 | (ch & 0x3f));
5097 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005098 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005099 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005100 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005101 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005102 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005103 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 if (reason != NULL) {
5105 *reason = "encoding error";
5106 }
5107 if (raw_malloc) {
5108 PyMem_RawFree(bytes);
5109 }
5110 else {
5111 PyMem_Free(bytes);
5112 }
5113 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005114 }
5115 *p++ = (char)(ch & 0xff);
5116 }
5117 else if (ch < 0x10000) {
5118 *p++ = (char)(0xe0 | (ch >> 12));
5119 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5120 *p++ = (char)(0x80 | (ch & 0x3f));
5121 }
5122 else { /* ch >= 0x10000 */
5123 assert(ch <= MAX_UNICODE);
5124 /* Encode UCS4 Unicode ordinals */
5125 *p++ = (char)(0xf0 | (ch >> 18));
5126 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5127 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5128 *p++ = (char)(0x80 | (ch & 0x3f));
5129 }
5130 }
5131 *p++ = '\0';
5132
5133 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005134 char *bytes2;
5135 if (raw_malloc) {
5136 bytes2 = PyMem_RawRealloc(bytes, final_size);
5137 }
5138 else {
5139 bytes2 = PyMem_Realloc(bytes, final_size);
5140 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005141 if (bytes2 == NULL) {
5142 if (error_pos != NULL) {
5143 *error_pos = (size_t)-1;
5144 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145 if (raw_malloc) {
5146 PyMem_RawFree(bytes);
5147 }
5148 else {
5149 PyMem_Free(bytes);
5150 }
5151 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005152 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005153 *str = bytes2;
5154 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005155}
5156
5157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005158/* Primary internal function which creates utf8 encoded bytes objects.
5159
5160 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005161 and allocate exactly as much space needed at the end. Else allocate the
5162 maximum possible needed (4 result bytes per Unicode character), and return
5163 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005164*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005165PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005166_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167{
Victor Stinner6099a032011-12-18 14:22:26 +01005168 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005169 void *data;
5170 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 if (!PyUnicode_Check(unicode)) {
5173 PyErr_BadArgument();
5174 return NULL;
5175 }
5176
5177 if (PyUnicode_READY(unicode) == -1)
5178 return NULL;
5179
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005180 if (PyUnicode_UTF8(unicode))
5181 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5182 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005183
5184 kind = PyUnicode_KIND(unicode);
5185 data = PyUnicode_DATA(unicode);
5186 size = PyUnicode_GET_LENGTH(unicode);
5187
Benjamin Petersonead6b532011-12-20 17:23:42 -06005188 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005189 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005190 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005191 case PyUnicode_1BYTE_KIND:
5192 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5193 assert(!PyUnicode_IS_ASCII(unicode));
5194 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5195 case PyUnicode_2BYTE_KIND:
5196 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5197 case PyUnicode_4BYTE_KIND:
5198 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200}
5201
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005203PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5204 Py_ssize_t size,
5205 const char *errors)
5206{
5207 PyObject *v, *unicode;
5208
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005209 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005210 if (unicode == NULL)
5211 return NULL;
5212 v = _PyUnicode_AsUTF8String(unicode, errors);
5213 Py_DECREF(unicode);
5214 return v;
5215}
5216
5217PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005218PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005220 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221}
5222
Walter Dörwald41980ca2007-08-16 21:55:45 +00005223/* --- UTF-32 Codec ------------------------------------------------------- */
5224
5225PyObject *
5226PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 Py_ssize_t size,
5228 const char *errors,
5229 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230{
5231 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5232}
5233
5234PyObject *
5235PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 Py_ssize_t size,
5237 const char *errors,
5238 int *byteorder,
5239 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005240{
5241 const char *starts = s;
5242 Py_ssize_t startinpos;
5243 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005245 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005246 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005247 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005249 PyObject *errorHandler = NULL;
5250 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005251
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 q = (unsigned char *)s;
5253 e = q + size;
5254
5255 if (byteorder)
5256 bo = *byteorder;
5257
5258 /* Check for BOM marks (U+FEFF) in the input and adjust current
5259 byte order setting accordingly. In native mode, the leading BOM
5260 mark is skipped, in all other modes, it is copied to the output
5261 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005262 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005263 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 if (bom == 0x0000FEFF) {
5265 bo = -1;
5266 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005268 else if (bom == 0xFFFE0000) {
5269 bo = 1;
5270 q += 4;
5271 }
5272 if (byteorder)
5273 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274 }
5275
Victor Stinnere64322e2012-10-30 23:12:47 +01005276 if (q == e) {
5277 if (consumed)
5278 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005279 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005280 }
5281
Victor Stinnere64322e2012-10-30 23:12:47 +01005282#ifdef WORDS_BIGENDIAN
5283 le = bo < 0;
5284#else
5285 le = bo <= 0;
5286#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005287 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005288
Victor Stinner8f674cc2013-04-17 23:02:17 +02005289 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005290 writer.min_length = (e - q + 3) / 4;
5291 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005293
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 while (1) {
5295 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005297
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 enum PyUnicode_Kind kind = writer.kind;
5300 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 if (le) {
5304 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005305 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 if (ch > maxch)
5307 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005308 if (kind != PyUnicode_1BYTE_KIND &&
5309 Py_UNICODE_IS_SURROGATE(ch))
5310 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005311 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005312 q += 4;
5313 } while (q <= last);
5314 }
5315 else {
5316 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005317 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005318 if (ch > maxch)
5319 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005320 if (kind != PyUnicode_1BYTE_KIND &&
5321 Py_UNICODE_IS_SURROGATE(ch))
5322 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 q += 4;
5325 } while (q <= last);
5326 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005327 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005328 }
5329
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005330 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005331 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005332 startinpos = ((const char *)q) - starts;
5333 endinpos = startinpos + 4;
5334 }
5335 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005338 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 startinpos = ((const char *)q) - starts;
5341 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 else {
5344 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005345 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 goto onError;
5347 q += 4;
5348 continue;
5349 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005350 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005351 startinpos = ((const char *)q) - starts;
5352 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005354
5355 /* The remaining input chars are ignored if the callback
5356 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005359 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005361 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363 }
5364
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368 Py_XDECREF(errorHandler);
5369 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374 Py_XDECREF(errorHandler);
5375 Py_XDECREF(exc);
5376 return NULL;
5377}
5378
5379PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005380_PyUnicode_EncodeUTF32(PyObject *str,
5381 const char *errors,
5382 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005383{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005384 enum PyUnicode_Kind kind;
5385 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005386 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005387 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005388 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005389#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005390 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005391#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005392 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005393#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005394 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005396 PyObject *errorHandler = NULL;
5397 PyObject *exc = NULL;
5398 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 if (!PyUnicode_Check(str)) {
5401 PyErr_BadArgument();
5402 return NULL;
5403 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005404 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005405 return NULL;
5406 kind = PyUnicode_KIND(str);
5407 data = PyUnicode_DATA(str);
5408 len = PyUnicode_GET_LENGTH(str);
5409
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005410 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005411 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005412 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005413 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005414 if (v == NULL)
5415 return NULL;
5416
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 /* output buffer is 4-bytes aligned */
5418 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005419 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005422 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005424
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005425 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005426 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005427 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005428 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 else
5430 encoding = "utf-32";
5431
5432 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5434 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005435 }
5436
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005437 pos = 0;
5438 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005439 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440
5441 if (kind == PyUnicode_2BYTE_KIND) {
5442 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5443 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005445 else {
5446 assert(kind == PyUnicode_4BYTE_KIND);
5447 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5448 &out, native_ordering);
5449 }
5450 if (pos == len)
5451 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005452
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005453 rep = unicode_encode_call_errorhandler(
5454 errors, &errorHandler,
5455 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005456 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005457 if (!rep)
5458 goto error;
5459
5460 if (PyBytes_Check(rep)) {
5461 repsize = PyBytes_GET_SIZE(rep);
5462 if (repsize & 3) {
5463 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005464 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 "surrogates not allowed");
5466 goto error;
5467 }
5468 moreunits = repsize / 4;
5469 }
5470 else {
5471 assert(PyUnicode_Check(rep));
5472 if (PyUnicode_READY(rep) < 0)
5473 goto error;
5474 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5475 if (!PyUnicode_IS_ASCII(rep)) {
5476 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005477 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 "surrogates not allowed");
5479 goto error;
5480 }
5481 }
5482
5483 /* four bytes are reserved for each surrogate */
5484 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005485 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005486 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005487 /* integer overflow */
5488 PyErr_NoMemory();
5489 goto error;
5490 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005491 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005492 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005493 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 }
5495
5496 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005497 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005498 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5502 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 }
5504
5505 Py_CLEAR(rep);
5506 }
5507
5508 /* Cut back to size actually needed. This is necessary for, for example,
5509 encoding of a string containing isolated surrogates and the 'ignore'
5510 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005511 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 if (nsize != PyBytes_GET_SIZE(v))
5513 _PyBytes_Resize(&v, nsize);
5514 Py_XDECREF(errorHandler);
5515 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005516 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005517 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005518 error:
5519 Py_XDECREF(rep);
5520 Py_XDECREF(errorHandler);
5521 Py_XDECREF(exc);
5522 Py_XDECREF(v);
5523 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524}
5525
Alexander Belopolsky40018472011-02-26 01:02:56 +00005526PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005527PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5528 Py_ssize_t size,
5529 const char *errors,
5530 int byteorder)
5531{
5532 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005533 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005534 if (tmp == NULL)
5535 return NULL;
5536 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5537 Py_DECREF(tmp);
5538 return result;
5539}
5540
5541PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005542PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005543{
Victor Stinnerb960b342011-11-20 19:12:52 +01005544 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005545}
5546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547/* --- UTF-16 Codec ------------------------------------------------------- */
5548
Tim Peters772747b2001-08-09 22:21:55 +00005549PyObject *
5550PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 Py_ssize_t size,
5552 const char *errors,
5553 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554{
Walter Dörwald69652032004-09-07 20:24:22 +00005555 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5556}
5557
5558PyObject *
5559PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 Py_ssize_t size,
5561 const char *errors,
5562 int *byteorder,
5563 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005564{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005566 Py_ssize_t startinpos;
5567 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005568 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005569 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005570 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005571 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005572 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573 PyObject *errorHandler = NULL;
5574 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005575 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Tim Peters772747b2001-08-09 22:21:55 +00005577 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
5580 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005581 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005583 /* Check for BOM marks (U+FEFF) in the input and adjust current
5584 byte order setting accordingly. In native mode, the leading BOM
5585 mark is skipped, in all other modes, it is copied to the output
5586 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005587 if (bo == 0 && size >= 2) {
5588 const Py_UCS4 bom = (q[1] << 8) | q[0];
5589 if (bom == 0xFEFF) {
5590 q += 2;
5591 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005593 else if (bom == 0xFFFE) {
5594 q += 2;
5595 bo = 1;
5596 }
5597 if (byteorder)
5598 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
Antoine Pitrou63065d72012-05-15 23:48:04 +02005601 if (q == e) {
5602 if (consumed)
5603 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005604 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005605 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606
Christian Heimes743e0cd2012-10-17 23:52:17 +02005607#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005609 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005610#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005613#endif
Tim Peters772747b2001-08-09 22:21:55 +00005614
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005616 character count normally. Error handler will take care of
5617 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005618 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005619 writer.min_length = (e - q + 1) / 2;
5620 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623 while (1) {
5624 Py_UCS4 ch = 0;
5625 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005626 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005630 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005631 native_ordering);
5632 else
5633 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005634 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005635 native_ordering);
5636 } else if (kind == PyUnicode_2BYTE_KIND) {
5637 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 native_ordering);
5640 } else {
5641 assert(kind == PyUnicode_4BYTE_KIND);
5642 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005643 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005644 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005645 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005646 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647
Antoine Pitrou63065d72012-05-15 23:48:04 +02005648 switch (ch)
5649 {
5650 case 0:
5651 /* remaining byte at the end? (size should be even) */
5652 if (q == e || consumed)
5653 goto End;
5654 errmsg = "truncated data";
5655 startinpos = ((const char *)q) - starts;
5656 endinpos = ((const char *)e) - starts;
5657 break;
5658 /* The remaining input chars are ignored if the callback
5659 chooses to skip the input */
5660 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005661 q -= 2;
5662 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005663 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005664 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005665 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005666 endinpos = ((const char *)e) - starts;
5667 break;
5668 case 2:
5669 errmsg = "illegal encoding";
5670 startinpos = ((const char *)q) - 2 - starts;
5671 endinpos = startinpos + 2;
5672 break;
5673 case 3:
5674 errmsg = "illegal UTF-16 surrogate";
5675 startinpos = ((const char *)q) - 4 - starts;
5676 endinpos = startinpos + 2;
5677 break;
5678 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005679 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 continue;
5682 }
5683
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005684 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005685 errors,
5686 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005687 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005688 &starts,
5689 (const char **)&e,
5690 &startinpos,
5691 &endinpos,
5692 &exc,
5693 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 }
5697
Antoine Pitrou63065d72012-05-15 23:48:04 +02005698End:
Walter Dörwald69652032004-09-07 20:24:22 +00005699 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 Py_XDECREF(errorHandler);
5703 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005704 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 Py_XDECREF(errorHandler);
5709 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 return NULL;
5711}
5712
Tim Peters772747b2001-08-09 22:21:55 +00005713PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005714_PyUnicode_EncodeUTF16(PyObject *str,
5715 const char *errors,
5716 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 enum PyUnicode_Kind kind;
5719 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005720 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005721 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005723 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005724#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005726#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005727 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005728#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005729 const char *encoding;
5730 Py_ssize_t nsize, pos;
5731 PyObject *errorHandler = NULL;
5732 PyObject *exc = NULL;
5733 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005734
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 if (!PyUnicode_Check(str)) {
5736 PyErr_BadArgument();
5737 return NULL;
5738 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005739 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 return NULL;
5741 kind = PyUnicode_KIND(str);
5742 data = PyUnicode_DATA(str);
5743 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005744
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005745 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005746 if (kind == PyUnicode_4BYTE_KIND) {
5747 const Py_UCS4 *in = (const Py_UCS4 *)data;
5748 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005749 while (in < end) {
5750 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005751 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 }
5753 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005754 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 nsize = len + pairs + (byteorder == 0);
5759 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005764 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005765 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005766 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005768 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005769 }
5770 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005771 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 }
Tim Peters772747b2001-08-09 22:21:55 +00005773
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 if (kind == PyUnicode_1BYTE_KIND) {
5775 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5776 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005777 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005778
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005779 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005781 }
5782 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005784 }
5785 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005787 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005788
5789 pos = 0;
5790 while (pos < len) {
5791 Py_ssize_t repsize, moreunits;
5792
5793 if (kind == PyUnicode_2BYTE_KIND) {
5794 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5795 &out, native_ordering);
5796 }
5797 else {
5798 assert(kind == PyUnicode_4BYTE_KIND);
5799 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5800 &out, native_ordering);
5801 }
5802 if (pos == len)
5803 break;
5804
5805 rep = unicode_encode_call_errorhandler(
5806 errors, &errorHandler,
5807 encoding, "surrogates not allowed",
5808 str, &exc, pos, pos + 1, &pos);
5809 if (!rep)
5810 goto error;
5811
5812 if (PyBytes_Check(rep)) {
5813 repsize = PyBytes_GET_SIZE(rep);
5814 if (repsize & 1) {
5815 raise_encode_exception(&exc, encoding,
5816 str, pos - 1, pos,
5817 "surrogates not allowed");
5818 goto error;
5819 }
5820 moreunits = repsize / 2;
5821 }
5822 else {
5823 assert(PyUnicode_Check(rep));
5824 if (PyUnicode_READY(rep) < 0)
5825 goto error;
5826 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5827 if (!PyUnicode_IS_ASCII(rep)) {
5828 raise_encode_exception(&exc, encoding,
5829 str, pos - 1, pos,
5830 "surrogates not allowed");
5831 goto error;
5832 }
5833 }
5834
5835 /* two bytes are reserved for each surrogate */
5836 if (moreunits > 1) {
5837 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005838 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 /* integer overflow */
5840 PyErr_NoMemory();
5841 goto error;
5842 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005843 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 goto error;
5845 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5846 }
5847
5848 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005849 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005850 out += moreunits;
5851 } else /* rep is unicode */ {
5852 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5853 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5854 &out, native_ordering);
5855 }
5856
5857 Py_CLEAR(rep);
5858 }
5859
5860 /* Cut back to size actually needed. This is necessary for, for example,
5861 encoding of a string containing isolated surrogates and the 'ignore' handler
5862 is used. */
5863 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5864 if (nsize != PyBytes_GET_SIZE(v))
5865 _PyBytes_Resize(&v, nsize);
5866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005868 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005869 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005870 error:
5871 Py_XDECREF(rep);
5872 Py_XDECREF(errorHandler);
5873 Py_XDECREF(exc);
5874 Py_XDECREF(v);
5875 return NULL;
5876#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877}
5878
Alexander Belopolsky40018472011-02-26 01:02:56 +00005879PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5881 Py_ssize_t size,
5882 const char *errors,
5883 int byteorder)
5884{
5885 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005886 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887 if (tmp == NULL)
5888 return NULL;
5889 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5890 Py_DECREF(tmp);
5891 return result;
5892}
5893
5894PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005895PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898}
5899
5900/* --- Unicode Escape Codec ----------------------------------------------- */
5901
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in by _PyUnicode_DecodeUnicodeEscape() (via
   PyCapsule_Import) the first time a \N{...} escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005903
Alexander Belopolsky40018472011-02-26 01:02:56 +00005904PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005905_PyUnicode_DecodeUnicodeEscape(const char *s,
5906 Py_ssize_t size,
5907 const char *errors,
5908 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005911 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 PyObject *errorHandler = NULL;
5914 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005915
Eric V. Smith42454af2016-10-31 09:22:08 -04005916 // so we can remember if we've seen an invalid escape char or not
5917 *first_invalid_escape = NULL;
5918
Victor Stinner62ec3312016-09-06 17:04:34 -07005919 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005920 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005921 }
5922 /* Escaped strings will always be longer than the resulting
5923 Unicode string, so we start with size here and then reduce the
5924 length after conversion to the true value.
5925 (but if the error callback returns a long replacement string
5926 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005927 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005928 writer.min_length = size;
5929 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5930 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005931 }
5932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 end = s + size;
5934 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005935 unsigned char c = (unsigned char) *s++;
5936 Py_UCS4 ch;
5937 int count;
5938 Py_ssize_t startinpos;
5939 Py_ssize_t endinpos;
5940 const char *message;
5941
5942#define WRITE_ASCII_CHAR(ch) \
5943 do { \
5944 assert(ch <= 127); \
5945 assert(writer.pos < writer.size); \
5946 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5947 } while(0)
5948
5949#define WRITE_CHAR(ch) \
5950 do { \
5951 if (ch <= writer.maxchar) { \
5952 assert(writer.pos < writer.size); \
5953 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5954 } \
5955 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5956 goto onError; \
5957 } \
5958 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
5960 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005961 if (c != '\\') {
5962 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 continue;
5964 }
5965
Victor Stinner62ec3312016-09-06 17:04:34 -07005966 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 if (s >= end) {
5969 message = "\\ at end of string";
5970 goto error;
5971 }
5972 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005973
Victor Stinner62ec3312016-09-06 17:04:34 -07005974 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005975 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005978 case '\n': continue;
5979 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5980 case '\'': WRITE_ASCII_CHAR('\''); continue;
5981 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5982 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5985 case 't': WRITE_ASCII_CHAR('\t'); continue;
5986 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5987 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005989 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 case '0': case '1': case '2': case '3':
5995 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005996 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005997 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 ch = (ch<<3) + *s++ - '0';
5999 if (s < end && '0' <= *s && *s <= '7') {
6000 ch = (ch<<3) + *s++ - '0';
6001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 WRITE_CHAR(ch);
6004 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 /* hex escapes */
6007 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006009 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006010 message = "truncated \\xXX escape";
6011 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006015 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006016 message = "truncated \\uXXXX escape";
6017 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006020 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006021 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006022 message = "truncated \\UXXXXXXXX escape";
6023 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006025 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 ch <<= 4;
6027 if (c >= '0' && c <= '9') {
6028 ch += c - '0';
6029 }
6030 else if (c >= 'a' && c <= 'f') {
6031 ch += c - ('a' - 10);
6032 }
6033 else if (c >= 'A' && c <= 'F') {
6034 ch += c - ('A' - 10);
6035 }
6036 else {
6037 break;
6038 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006039 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006040 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006041 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006042 }
6043
6044 /* when we get here, ch is a 32-bit unicode character */
6045 if (ch > MAX_UNICODE) {
6046 message = "illegal Unicode character";
6047 goto error;
6048 }
6049
6050 WRITE_CHAR(ch);
6051 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006052
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006054 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055 if (ucnhash_CAPI == NULL) {
6056 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006057 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6058 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006059 if (ucnhash_CAPI == NULL) {
6060 PyErr_SetString(
6061 PyExc_UnicodeError,
6062 "\\N escapes not supported (can't load unicodedata module)"
6063 );
6064 goto onError;
6065 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006067
6068 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006070 const char *start = ++s;
6071 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 namelen = s - start;
6076 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006079 ch = 0xffffffff; /* in case 'getcode' messes up */
6080 if (namelen <= INT_MAX &&
6081 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6082 &ch, 0)) {
6083 assert(ch <= MAX_UNICODE);
6084 WRITE_CHAR(ch);
6085 continue;
6086 }
6087 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006088 }
6089 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006090 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006091
6092 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006093 if (*first_invalid_escape == NULL) {
6094 *first_invalid_escape = s-1; /* Back up one char, since we've
6095 already incremented s. */
6096 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006097 WRITE_ASCII_CHAR('\\');
6098 WRITE_CHAR(c);
6099 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006101
6102 error:
6103 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006104 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006105 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006106 errors, &errorHandler,
6107 "unicodeescape", message,
6108 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006109 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006110 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006111 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006112 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006113
6114#undef WRITE_ASCII_CHAR
6115#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006117
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006118 Py_XDECREF(errorHandler);
6119 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006120 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006121
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006123 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124 Py_XDECREF(errorHandler);
6125 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 return NULL;
6127}
6128
Eric V. Smith42454af2016-10-31 09:22:08 -04006129PyObject *
6130PyUnicode_DecodeUnicodeEscape(const char *s,
6131 Py_ssize_t size,
6132 const char *errors)
6133{
6134 const char *first_invalid_escape;
6135 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6136 &first_invalid_escape);
6137 if (result == NULL)
6138 return NULL;
6139 if (first_invalid_escape != NULL) {
6140 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6141 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006142 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006143 Py_DECREF(result);
6144 return NULL;
6145 }
6146 }
6147 return result;
6148}
6149
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006150/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Alexander Belopolsky40018472011-02-26 01:02:56 +00006152PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161
Ezio Melottie7f90372012-10-05 03:33:31 +03006162 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006163 escape.
6164
Ezio Melottie7f90372012-10-05 03:33:31 +03006165 For UCS1 strings it's '\xxx', 4 bytes per source character.
6166 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6167 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006168 */
6169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 if (!PyUnicode_Check(unicode)) {
6171 PyErr_BadArgument();
6172 return NULL;
6173 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006174 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 }
Victor Stinner358af132015-10-12 22:36:57 +02006177
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 if (len == 0) {
6180 return PyBytes_FromStringAndSize(NULL, 0);
6181 }
6182
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 kind = PyUnicode_KIND(unicode);
6184 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006185 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6186 bytes, and 1 byte characters 4. */
6187 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006188 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 return PyErr_NoMemory();
6190 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006191 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 if (repr == NULL) {
6193 return NULL;
6194 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006197 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006198 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006199
Victor Stinner62ec3312016-09-06 17:04:34 -07006200 /* U+0000-U+00ff range */
6201 if (ch < 0x100) {
6202 if (ch >= ' ' && ch < 127) {
6203 if (ch != '\\') {
6204 /* Copy printable US ASCII as-is */
6205 *p++ = (char) ch;
6206 }
6207 /* Escape backslashes */
6208 else {
6209 *p++ = '\\';
6210 *p++ = '\\';
6211 }
6212 }
Victor Stinner358af132015-10-12 22:36:57 +02006213
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 /* Map special whitespace to '\t', \n', '\r' */
6215 else if (ch == '\t') {
6216 *p++ = '\\';
6217 *p++ = 't';
6218 }
6219 else if (ch == '\n') {
6220 *p++ = '\\';
6221 *p++ = 'n';
6222 }
6223 else if (ch == '\r') {
6224 *p++ = '\\';
6225 *p++ = 'r';
6226 }
6227
6228 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6229 else {
6230 *p++ = '\\';
6231 *p++ = 'x';
6232 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6233 *p++ = Py_hexdigits[ch & 0x000F];
6234 }
Tim Petersced69f82003-09-16 20:30:58 +00006235 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006236 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006237 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 *p++ = '\\';
6239 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006240 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6241 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6242 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6243 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6246 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006247
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 /* Make sure that the first two digits are zero */
6249 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006250 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 *p++ = 'U';
6252 *p++ = '0';
6253 *p++ = '0';
6254 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6257 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6259 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 assert(p - PyBytes_AS_STRING(repr) > 0);
6264 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6265 return NULL;
6266 }
6267 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268}
6269
Alexander Belopolsky40018472011-02-26 01:02:56 +00006270PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006271PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6272 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006274 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006275 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006278 }
6279
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006280 result = PyUnicode_AsUnicodeEscapeString(tmp);
6281 Py_DECREF(tmp);
6282 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283}
6284
6285/* --- Raw Unicode Escape Codec ------------------------------------------- */
6286
Alexander Belopolsky40018472011-02-26 01:02:56 +00006287PyObject *
6288PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006289 Py_ssize_t size,
6290 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006293 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 PyObject *errorHandler = NULL;
6296 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006297
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006299 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006301
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 /* Escaped strings will always be longer than the resulting
6303 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304 length after conversion to the true value. (But decoding error
6305 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006306 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 writer.min_length = size;
6308 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6309 goto onError;
6310 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006311
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 end = s + size;
6313 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 unsigned char c = (unsigned char) *s++;
6315 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006316 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 Py_ssize_t startinpos;
6318 Py_ssize_t endinpos;
6319 const char *message;
6320
6321#define WRITE_CHAR(ch) \
6322 do { \
6323 if (ch <= writer.maxchar) { \
6324 assert(writer.pos < writer.size); \
6325 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6326 } \
6327 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6328 goto onError; \
6329 } \
6330 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 if (c != '\\' || s >= end) {
6334 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006337
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 c = (unsigned char) *s++;
6339 if (c == 'u') {
6340 count = 4;
6341 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 else if (c == 'U') {
6344 count = 8;
6345 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006346 }
6347 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006348 assert(writer.pos < writer.size);
6349 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6350 WRITE_CHAR(c);
6351 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006352 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006353 startinpos = s - starts - 2;
6354
6355 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6356 for (ch = 0; count && s < end; ++s, --count) {
6357 c = (unsigned char)*s;
6358 ch <<= 4;
6359 if (c >= '0' && c <= '9') {
6360 ch += c - '0';
6361 }
6362 else if (c >= 'a' && c <= 'f') {
6363 ch += c - ('a' - 10);
6364 }
6365 else if (c >= 'A' && c <= 'F') {
6366 ch += c - ('A' - 10);
6367 }
6368 else {
6369 break;
6370 }
6371 }
6372 if (!count) {
6373 if (ch <= MAX_UNICODE) {
6374 WRITE_CHAR(ch);
6375 continue;
6376 }
6377 message = "\\Uxxxxxxxx out of range";
6378 }
6379
6380 endinpos = s-starts;
6381 writer.min_length = end - s + writer.pos;
6382 if (unicode_decode_call_errorhandler_writer(
6383 errors, &errorHandler,
6384 "rawunicodeescape", message,
6385 &starts, &end, &startinpos, &endinpos, &exc, &s,
6386 &writer)) {
6387 goto onError;
6388 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006389 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006390
6391#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 Py_XDECREF(errorHandler);
6394 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006395 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403}
6404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408{
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 int kind;
6413 void *data;
6414 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416 if (!PyUnicode_Check(unicode)) {
6417 PyErr_BadArgument();
6418 return NULL;
6419 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 kind = PyUnicode_KIND(unicode);
6424 data = PyUnicode_DATA(unicode);
6425 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 if (kind == PyUnicode_1BYTE_KIND) {
6427 return PyBytes_FromStringAndSize(data, len);
6428 }
Victor Stinner0e368262011-11-10 20:12:49 +01006429
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6431 bytes, and 1 byte characters 4. */
6432 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006433
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 if (len > PY_SSIZE_T_MAX / expandsize) {
6435 return PyErr_NoMemory();
6436 }
6437 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6438 if (repr == NULL) {
6439 return NULL;
6440 }
6441 if (len == 0) {
6442 return repr;
6443 }
6444
6445 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446 for (pos = 0; pos < len; pos++) {
6447 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006448
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6450 if (ch < 0x100) {
6451 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006452 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006453 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 *p++ = '\\';
6456 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006457 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6460 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006462 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6463 else {
6464 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6465 *p++ = '\\';
6466 *p++ = 'U';
6467 *p++ = '0';
6468 *p++ = '0';
6469 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6474 *p++ = Py_hexdigits[ch & 15];
6475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006477
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 assert(p > PyBytes_AS_STRING(repr));
6479 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6480 return NULL;
6481 }
6482 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483}
6484
Alexander Belopolsky40018472011-02-26 01:02:56 +00006485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006490 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006492 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6494 Py_DECREF(tmp);
6495 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006498/* --- Unicode Internal Codec ------------------------------------------- */
6499
Alexander Belopolsky40018472011-02-26 01:02:56 +00006500PyObject *
6501_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006502 Py_ssize_t size,
6503 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006504{
6505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006506 Py_ssize_t startinpos;
6507 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006508 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006509 const char *end;
6510 const char *reason;
6511 PyObject *errorHandler = NULL;
6512 PyObject *exc = NULL;
6513
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006514 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006515 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006516 1))
6517 return NULL;
6518
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006519 if (size < 0) {
6520 PyErr_BadInternalCall();
6521 return NULL;
6522 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006523 if (size == 0)
6524 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006525
Victor Stinner8f674cc2013-04-17 23:02:17 +02006526 _PyUnicodeWriter_Init(&writer);
6527 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6528 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006530 }
6531 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006535 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006536 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006537 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006538 endinpos = end-starts;
6539 reason = "truncated input";
6540 goto error;
6541 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006542 /* We copy the raw representation one byte at a time because the
6543 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006544 ((char *) &uch)[0] = s[0];
6545 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006546#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[2] = s[2];
6548 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006551#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 /* We have to sanity check the raw data, otherwise doom looms for
6553 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006554 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006555 endinpos = s - starts + Py_UNICODE_SIZE;
6556 reason = "illegal code point (> 0x10FFFF)";
6557 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006559#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006560 s += Py_UNICODE_SIZE;
6561#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006562 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006564 Py_UNICODE uch2;
6565 ((char *) &uch2)[0] = s[0];
6566 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006567 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006568 {
Victor Stinner551ac952011-11-29 22:58:13 +01006569 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006571 }
6572 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573#endif
6574
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006575 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006577 continue;
6578
6579 error:
6580 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006581 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006582 errors, &errorHandler,
6583 "unicode_internal", reason,
6584 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006585 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006586 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006587 }
6588
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006589 Py_XDECREF(errorHandler);
6590 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006591 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006594 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006595 Py_XDECREF(errorHandler);
6596 Py_XDECREF(exc);
6597 return NULL;
6598}
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600/* --- Latin-1 Codec ------------------------------------------------------ */
6601
Alexander Belopolsky40018472011-02-26 01:02:56 +00006602PyObject *
6603PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006604 Py_ssize_t size,
6605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006608 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609}
6610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612static void
6613make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006615 PyObject *unicode,
6616 Py_ssize_t startpos, Py_ssize_t endpos,
6617 const char *reason)
6618{
6619 if (*exceptionObject == NULL) {
6620 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006621 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006622 encoding, unicode, startpos, endpos, reason);
6623 }
6624 else {
6625 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6626 goto onError;
6627 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6628 goto onError;
6629 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6630 goto onError;
6631 return;
6632 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006633 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006634 }
6635}
6636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638static void
6639raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006640 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006641 PyObject *unicode,
6642 Py_ssize_t startpos, Py_ssize_t endpos,
6643 const char *reason)
6644{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006645 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006646 encoding, unicode, startpos, endpos, reason);
6647 if (*exceptionObject != NULL)
6648 PyCodec_StrictErrors(*exceptionObject);
6649}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650
6651/* error handling callback helper:
6652 build arguments, call the callback and check the arguments,
6653 put the result into newpos and return the replacement string, which
6654 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655static PyObject *
6656unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006657 PyObject **errorHandler,
6658 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 Py_ssize_t startpos, Py_ssize_t endpos,
6661 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006663 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 PyObject *restuple;
6666 PyObject *resunicode;
6667
6668 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 }
6673
Benjamin Petersonbac79492012-01-14 13:34:47 -05006674 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006675 return NULL;
6676 len = PyUnicode_GET_LENGTH(unicode);
6677
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006678 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006683 restuple = PyObject_CallFunctionObjArgs(
6684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006688 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 Py_DECREF(restuple);
6690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006692 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 &resunicode, newpos)) {
6694 Py_DECREF(restuple);
6695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006697 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6698 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6699 Py_DECREF(restuple);
6700 return NULL;
6701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703 *newpos = len + *newpos;
6704 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006705 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 Py_DECREF(restuple);
6707 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_INCREF(resunicode);
6710 Py_DECREF(restuple);
6711 return resunicode;
6712}
6713
Alexander Belopolsky40018472011-02-26 01:02:56 +00006714static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006716 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006717 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 /* input state */
6720 Py_ssize_t pos=0, size;
6721 int kind;
6722 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 /* pointer into the output */
6724 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006725 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6726 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006727 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006729 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006730 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006731 /* output object */
6732 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733
Benjamin Petersonbac79492012-01-14 13:34:47 -05006734 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 return NULL;
6736 size = PyUnicode_GET_LENGTH(unicode);
6737 kind = PyUnicode_KIND(unicode);
6738 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 /* allocate enough for a simple encoding without
6740 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006741 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006742 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006743
6744 _PyBytesWriter_Init(&writer);
6745 str = _PyBytesWriter_Alloc(&writer, size);
6746 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006749 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006750 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006755 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006757 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006759 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006762 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006764
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006765 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006767
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006768 /* Only overallocate the buffer if it's not the last write */
6769 writer.overallocate = (collend < size);
6770
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006772 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006773 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006774
6775 switch (error_handler) {
6776 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006777 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006779
6780 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006781 memset(str, '?', collend - collstart);
6782 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006783 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006784 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 break;
Victor Stinner50149202015-09-22 00:26:54 +02006787
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006788 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006789 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006790 writer.min_size -= (collend - collstart);
6791 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006792 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006793 if (str == NULL)
6794 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 pos = collend;
6796 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006797
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006799 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006800 writer.min_size -= (collend - collstart);
6801 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006802 unicode, collstart, collend);
6803 if (str == NULL)
6804 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 break;
Victor Stinner50149202015-09-22 00:26:54 +02006807
Victor Stinnerc3713e92015-09-29 12:32:13 +02006808 case _Py_ERROR_SURROGATEESCAPE:
6809 for (i = collstart; i < collend; ++i) {
6810 ch = PyUnicode_READ(kind, data, i);
6811 if (ch < 0xdc80 || 0xdcff < ch) {
6812 /* Not a UTF-8b surrogate */
6813 break;
6814 }
6815 *str++ = (char)(ch - 0xdc00);
6816 ++pos;
6817 }
6818 if (i >= collend)
6819 break;
6820 collstart = pos;
6821 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006822 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006823
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006825 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6826 encoding, reason, unicode, &exc,
6827 collstart, collend, &newpos);
6828 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006830
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006831 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006832 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006833
Victor Stinner6bd525b2015-10-09 13:10:05 +02006834 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006835 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006836 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 PyBytes_AS_STRING(rep),
6838 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006839 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 else {
6841 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006842
Victor Stinner6bd525b2015-10-09 13:10:05 +02006843 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006846 if (limit == 256 ?
6847 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6848 !PyUnicode_IS_ASCII(rep))
6849 {
6850 /* Not all characters are smaller than limit */
6851 raise_encode_exception(&exc, encoding, unicode,
6852 collstart, collend, reason);
6853 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006855 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6856 str = _PyBytesWriter_WriteBytes(&writer, str,
6857 PyUnicode_DATA(rep),
6858 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006860 if (str == NULL)
6861 goto onError;
6862
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006863 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006864 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006865 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006866
6867 /* If overallocation was disabled, ensure that it was the last
6868 write. Otherwise, we missed an optimization */
6869 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006870 }
6871 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006872
Victor Stinner50149202015-09-22 00:26:54 +02006873 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006875 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006876
6877 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006878 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006879 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006880 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006881 Py_XDECREF(exc);
6882 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883}
6884
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006885/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006886PyObject *
6887PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006888 Py_ssize_t size,
6889 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006892 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 if (unicode == NULL)
6894 return NULL;
6895 result = unicode_encode_ucs1(unicode, errors, 256);
6896 Py_DECREF(unicode);
6897 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898}
6899
Alexander Belopolsky40018472011-02-26 01:02:56 +00006900PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006901_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902{
6903 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 PyErr_BadArgument();
6905 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006907 if (PyUnicode_READY(unicode) == -1)
6908 return NULL;
6909 /* Fast path: if it is a one-byte string, construct
6910 bytes object directly. */
6911 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6912 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6913 PyUnicode_GET_LENGTH(unicode));
6914 /* Non-Latin-1 characters present. Defer to above function to
6915 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006916 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006917}
6918
6919PyObject*
6920PyUnicode_AsLatin1String(PyObject *unicode)
6921{
6922 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923}
6924
6925/* --- 7-bit ASCII Codec -------------------------------------------------- */
6926
Alexander Belopolsky40018472011-02-26 01:02:56 +00006927PyObject *
6928PyUnicode_DecodeASCII(const char *s,
6929 Py_ssize_t size,
6930 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006933 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006934 int kind;
6935 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006936 Py_ssize_t startinpos;
6937 Py_ssize_t endinpos;
6938 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006940 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006942 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006943
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006945 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006948 if (size == 1 && (unsigned char)s[0] < 128)
6949 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006950
Victor Stinner8f674cc2013-04-17 23:02:17 +02006951 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006952 writer.min_length = size;
6953 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006954 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006956 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006957 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006958 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 writer.pos = outpos;
6960 if (writer.pos == size)
6961 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006962
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006963 s += writer.pos;
6964 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006965 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006966 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006968 PyUnicode_WRITE(kind, data, writer.pos, c);
6969 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006971 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006973
6974 /* byte outsize range 0x00..0x7f: call the error handler */
6975
6976 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006977 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006978
6979 switch (error_handler)
6980 {
6981 case _Py_ERROR_REPLACE:
6982 case _Py_ERROR_SURROGATEESCAPE:
6983 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006984 but we may switch to UCS2 at the first write */
6985 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6986 goto onError;
6987 kind = writer.kind;
6988 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006989
6990 if (error_handler == _Py_ERROR_REPLACE)
6991 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6992 else
6993 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6994 writer.pos++;
6995 ++s;
6996 break;
6997
6998 case _Py_ERROR_IGNORE:
6999 ++s;
7000 break;
7001
7002 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 startinpos = s-starts;
7004 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007005 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007006 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 "ascii", "ordinal not in range(128)",
7008 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007011 kind = writer.kind;
7012 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007015 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007017 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007018
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007020 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007021 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007022 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 return NULL;
7024}
7025
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007026/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007027PyObject *
7028PyUnicode_EncodeASCII(const Py_UNICODE *p,
7029 Py_ssize_t size,
7030 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007033 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 if (unicode == NULL)
7035 return NULL;
7036 result = unicode_encode_ucs1(unicode, errors, 128);
7037 Py_DECREF(unicode);
7038 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039}
7040
Alexander Belopolsky40018472011-02-26 01:02:56 +00007041PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007042_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
7044 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 PyErr_BadArgument();
7046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007048 if (PyUnicode_READY(unicode) == -1)
7049 return NULL;
7050 /* Fast path: if it is an ASCII-only string, construct bytes object
7051 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007052 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007053 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7054 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007055 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007056}
7057
7058PyObject *
7059PyUnicode_AsASCIIString(PyObject *unicode)
7060{
7061 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Steve Dowercc16be82016-09-08 10:35:16 -07007064#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007065
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007066/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007067
/* NEED_RETRY is defined when size_t is wider than int. In that case
   decode_code_page_stateful() feeds inputs longer than INT_MAX to the
   decoder in INT_MAX-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide a fallback definition when WC_ERR_INVALID_CHARS is not already
   defined. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7075
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007076static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007077code_page_name(UINT code_page, PyObject **obj)
7078{
7079 *obj = NULL;
7080 if (code_page == CP_ACP)
7081 return "mbcs";
7082 if (code_page == CP_UTF7)
7083 return "CP_UTF7";
7084 if (code_page == CP_UTF8)
7085 return "CP_UTF8";
7086
7087 *obj = PyBytes_FromFormat("cp%u", code_page);
7088 if (*obj == NULL)
7089 return NULL;
7090 return PyBytes_AS_STRING(*obj);
7091}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092
Victor Stinner3a50e702011-10-18 21:21:00 +02007093static DWORD
7094decode_code_page_flags(UINT code_page)
7095{
7096 if (code_page == CP_UTF7) {
7097 /* The CP_UTF7 decoder only supports flags=0 */
7098 return 0;
7099 }
7100 else
7101 return MB_ERR_INVALID_CHARS;
7102}
7103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 * Decode a byte string from a Windows code page into unicode object in strict
7106 * mode.
7107 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007108 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7109 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007111static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007112decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007113 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 const char *in,
7115 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116{
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007118 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120
7121 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 assert(insize > 0);
7123 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7124 if (outsize <= 0)
7125 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007126
7127 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007129 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007130 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 if (*v == NULL)
7132 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134 }
7135 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007138 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141 }
7142
7143 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7145 if (outsize <= 0)
7146 goto error;
7147 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007148
Victor Stinner3a50e702011-10-18 21:21:00 +02007149error:
7150 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7151 return -2;
7152 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007153 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154}
7155
Victor Stinner3a50e702011-10-18 21:21:00 +02007156/*
7157 * Decode a byte string from a code page into unicode object with an error
7158 * handler.
7159 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007160 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 * UnicodeDecodeError exception and returns -1 on error.
7162 */
7163static int
7164decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007165 PyObject **v,
7166 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007167 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007168{
7169 const char *startin = in;
7170 const char *endin = in + size;
7171 const DWORD flags = decode_code_page_flags(code_page);
7172 /* Ideally, we should get reason from FormatMessage. This is the Windows
7173 2000 English version of the message. */
7174 const char *reason = "No mapping for the Unicode character exists "
7175 "in the target code page.";
7176 /* each step cannot decode more than 1 character, but a character can be
7177 represented as a surrogate pair */
7178 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007179 int insize;
7180 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 PyObject *errorHandler = NULL;
7182 PyObject *exc = NULL;
7183 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007184 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 DWORD err;
7186 int ret = -1;
7187
7188 assert(size > 0);
7189
7190 encoding = code_page_name(code_page, &encoding_obj);
7191 if (encoding == NULL)
7192 return -1;
7193
Victor Stinner7d00cc12014-03-17 23:08:06 +01007194 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7196 UnicodeDecodeError. */
7197 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7198 if (exc != NULL) {
7199 PyCodec_StrictErrors(exc);
7200 Py_CLEAR(exc);
7201 }
7202 goto error;
7203 }
7204
7205 if (*v == NULL) {
7206 /* Create unicode object */
7207 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7208 PyErr_NoMemory();
7209 goto error;
7210 }
Victor Stinnerab595942011-12-17 04:59:06 +01007211 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007212 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 if (*v == NULL)
7214 goto error;
7215 startout = PyUnicode_AS_UNICODE(*v);
7216 }
7217 else {
7218 /* Extend unicode object */
7219 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7220 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7221 PyErr_NoMemory();
7222 goto error;
7223 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007224 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 goto error;
7226 startout = PyUnicode_AS_UNICODE(*v) + n;
7227 }
7228
7229 /* Decode the byte string character per character */
7230 out = startout;
7231 while (in < endin)
7232 {
7233 /* Decode a character */
7234 insize = 1;
7235 do
7236 {
7237 outsize = MultiByteToWideChar(code_page, flags,
7238 in, insize,
7239 buffer, Py_ARRAY_LENGTH(buffer));
7240 if (outsize > 0)
7241 break;
7242 err = GetLastError();
7243 if (err != ERROR_NO_UNICODE_TRANSLATION
7244 && err != ERROR_INSUFFICIENT_BUFFER)
7245 {
7246 PyErr_SetFromWindowsErr(0);
7247 goto error;
7248 }
7249 insize++;
7250 }
7251 /* 4=maximum length of a UTF-8 sequence */
7252 while (insize <= 4 && (in + insize) <= endin);
7253
7254 if (outsize <= 0) {
7255 Py_ssize_t startinpos, endinpos, outpos;
7256
Victor Stinner7d00cc12014-03-17 23:08:06 +01007257 /* last character in partial decode? */
7258 if (in + insize >= endin && !final)
7259 break;
7260
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 startinpos = in - startin;
7262 endinpos = startinpos + 1;
7263 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007264 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 errors, &errorHandler,
7266 encoding, reason,
7267 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007268 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 {
7270 goto error;
7271 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007272 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 }
7274 else {
7275 in += insize;
7276 memcpy(out, buffer, outsize * sizeof(wchar_t));
7277 out += outsize;
7278 }
7279 }
7280
7281 /* write a NUL character at the end */
7282 *out = 0;
7283
7284 /* Extend unicode object */
7285 outsize = out - startout;
7286 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007287 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007289 /* (in - startin) <= size and size is an int */
7290 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007291
7292error:
7293 Py_XDECREF(encoding_obj);
7294 Py_XDECREF(errorHandler);
7295 Py_XDECREF(exc);
7296 return ret;
7297}
7298
Victor Stinner3a50e702011-10-18 21:21:00 +02007299static PyObject *
7300decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007301 const char *s, Py_ssize_t size,
7302 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007303{
Victor Stinner76a31a62011-11-04 00:05:13 +01007304 PyObject *v = NULL;
7305 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007306
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 if (code_page < 0) {
7308 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7309 return NULL;
7310 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007311 if (size < 0) {
7312 PyErr_BadInternalCall();
7313 return NULL;
7314 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007315
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 do
7320 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 if (size > INT_MAX) {
7323 chunk_size = INT_MAX;
7324 final = 0;
7325 done = 0;
7326 }
7327 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007329 {
7330 chunk_size = (int)size;
7331 final = (consumed == NULL);
7332 done = 1;
7333 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007334
Victor Stinner76a31a62011-11-04 00:05:13 +01007335 if (chunk_size == 0 && done) {
7336 if (v != NULL)
7337 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007338 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 converted = decode_code_page_strict(code_page, &v,
7342 s, chunk_size);
7343 if (converted == -2)
7344 converted = decode_code_page_errors(code_page, &v,
7345 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007346 errors, final);
7347 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007348
7349 if (converted < 0) {
7350 Py_XDECREF(v);
7351 return NULL;
7352 }
7353
7354 if (consumed)
7355 *consumed += converted;
7356
7357 s += converted;
7358 size -= converted;
7359 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007360
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007361 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007362}
7363
Alexander Belopolsky40018472011-02-26 01:02:56 +00007364PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007365PyUnicode_DecodeCodePageStateful(int code_page,
7366 const char *s,
7367 Py_ssize_t size,
7368 const char *errors,
7369 Py_ssize_t *consumed)
7370{
7371 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7372}
7373
7374PyObject *
7375PyUnicode_DecodeMBCSStateful(const char *s,
7376 Py_ssize_t size,
7377 const char *errors,
7378 Py_ssize_t *consumed)
7379{
7380 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7381}
7382
7383PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007384PyUnicode_DecodeMBCS(const char *s,
7385 Py_ssize_t size,
7386 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007387{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7389}
7390
Victor Stinner3a50e702011-10-18 21:21:00 +02007391static DWORD
7392encode_code_page_flags(UINT code_page, const char *errors)
7393{
7394 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007395 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 }
7397 else if (code_page == CP_UTF7) {
7398 /* CP_UTF7 only supports flags=0 */
7399 return 0;
7400 }
7401 else {
7402 if (errors != NULL && strcmp(errors, "replace") == 0)
7403 return 0;
7404 else
7405 return WC_NO_BEST_FIT_CHARS;
7406 }
7407}
7408
/*
 * Encode a Unicode string to a Windows code page into a byte string in strict
 * mode.
 *
 * Returns 0 on success, returns -2 on encode error, or raises an
 * OSError and returns -1 on other error.
 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007416static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007417encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007418 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007420{
Victor Stinner554f3f02010-06-16 23:33:54 +00007421 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 BOOL *pusedDefaultChar = &usedDefaultChar;
7423 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007424 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const DWORD flags = encode_code_page_flags(code_page, NULL);
7427 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007428 /* Create a substring so that we can get the UTF-16 representation
7429 of just the slice under consideration. */
7430 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431
Martin v. Löwis3d325192011-11-04 18:23:06 +01007432 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007433
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007435 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007437 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007438
Victor Stinner2fc507f2011-11-04 20:06:39 +01007439 substring = PyUnicode_Substring(unicode, offset, offset+len);
7440 if (substring == NULL)
7441 return -1;
7442 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7443 if (p == NULL) {
7444 Py_DECREF(substring);
7445 return -1;
7446 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007447 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007448
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007449 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007451 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 NULL, 0,
7453 NULL, pusedDefaultChar);
7454 if (outsize <= 0)
7455 goto error;
7456 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 if (pusedDefaultChar && *pusedDefaultChar) {
7458 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007461
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 if (*outbytes == NULL) {
7466 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470 }
7471 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 const Py_ssize_t n = PyBytes_Size(*outbytes);
7474 if (outsize > PY_SSIZE_T_MAX - n) {
7475 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7480 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484 }
7485
7486 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007488 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 out, outsize,
7490 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 if (outsize <= 0)
7493 goto error;
7494 if (pusedDefaultChar && *pusedDefaultChar)
7495 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007497
Victor Stinner3a50e702011-10-18 21:21:00 +02007498error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7501 return -2;
7502 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007503 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007504}
7505
Victor Stinner3a50e702011-10-18 21:21:00 +02007506/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007507 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 * error handler.
7509 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007510 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 * -1 on other error.
7512 */
7513static int
7514encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007515 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007517{
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519 Py_ssize_t pos = unicode_offset;
7520 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 /* Ideally, we should get reason from FormatMessage. This is the Windows
7522 2000 English version of the message. */
7523 const char *reason = "invalid character";
7524 /* 4=maximum length of a UTF-8 sequence */
7525 char buffer[4];
7526 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7527 Py_ssize_t outsize;
7528 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 PyObject *errorHandler = NULL;
7530 PyObject *exc = NULL;
7531 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007532 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007533 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 PyObject *rep;
7535 int ret = -1;
7536
7537 assert(insize > 0);
7538
7539 encoding = code_page_name(code_page, &encoding_obj);
7540 if (encoding == NULL)
7541 return -1;
7542
7543 if (errors == NULL || strcmp(errors, "strict") == 0) {
7544 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7545 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007546 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 if (exc != NULL) {
7548 PyCodec_StrictErrors(exc);
7549 Py_DECREF(exc);
7550 }
7551 Py_XDECREF(encoding_obj);
7552 return -1;
7553 }
7554
7555 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7556 pusedDefaultChar = &usedDefaultChar;
7557 else
7558 pusedDefaultChar = NULL;
7559
7560 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7561 PyErr_NoMemory();
7562 goto error;
7563 }
7564 outsize = insize * Py_ARRAY_LENGTH(buffer);
7565
7566 if (*outbytes == NULL) {
7567 /* Create string object */
7568 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7569 if (*outbytes == NULL)
7570 goto error;
7571 out = PyBytes_AS_STRING(*outbytes);
7572 }
7573 else {
7574 /* Extend string object */
7575 Py_ssize_t n = PyBytes_Size(*outbytes);
7576 if (n > PY_SSIZE_T_MAX - outsize) {
7577 PyErr_NoMemory();
7578 goto error;
7579 }
7580 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7581 goto error;
7582 out = PyBytes_AS_STRING(*outbytes) + n;
7583 }
7584
7585 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007586 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007588 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7589 wchar_t chars[2];
7590 int charsize;
7591 if (ch < 0x10000) {
7592 chars[0] = (wchar_t)ch;
7593 charsize = 1;
7594 }
7595 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007596 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7597 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007598 charsize = 2;
7599 }
7600
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007602 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 buffer, Py_ARRAY_LENGTH(buffer),
7604 NULL, pusedDefaultChar);
7605 if (outsize > 0) {
7606 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7607 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007608 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 memcpy(out, buffer, outsize);
7610 out += outsize;
7611 continue;
7612 }
7613 }
7614 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7615 PyErr_SetFromWindowsErr(0);
7616 goto error;
7617 }
7618
Victor Stinner3a50e702011-10-18 21:21:00 +02007619 rep = unicode_encode_call_errorhandler(
7620 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007621 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 if (rep == NULL)
7624 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007625 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007626
7627 if (PyBytes_Check(rep)) {
7628 outsize = PyBytes_GET_SIZE(rep);
7629 if (outsize != 1) {
7630 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7631 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7632 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7633 Py_DECREF(rep);
7634 goto error;
7635 }
7636 out = PyBytes_AS_STRING(*outbytes) + offset;
7637 }
7638 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7639 out += outsize;
7640 }
7641 else {
7642 Py_ssize_t i;
7643 enum PyUnicode_Kind kind;
7644 void *data;
7645
Benjamin Petersonbac79492012-01-14 13:34:47 -05007646 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 Py_DECREF(rep);
7648 goto error;
7649 }
7650
7651 outsize = PyUnicode_GET_LENGTH(rep);
7652 if (outsize != 1) {
7653 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7654 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7655 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7656 Py_DECREF(rep);
7657 goto error;
7658 }
7659 out = PyBytes_AS_STRING(*outbytes) + offset;
7660 }
7661 kind = PyUnicode_KIND(rep);
7662 data = PyUnicode_DATA(rep);
7663 for (i=0; i < outsize; i++) {
7664 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7665 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007666 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007667 encoding, unicode,
7668 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 "unable to encode error handler result to ASCII");
7670 Py_DECREF(rep);
7671 goto error;
7672 }
7673 *out = (unsigned char)ch;
7674 out++;
7675 }
7676 }
7677 Py_DECREF(rep);
7678 }
7679 /* write a NUL byte */
7680 *out = 0;
7681 outsize = out - PyBytes_AS_STRING(*outbytes);
7682 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7683 if (_PyBytes_Resize(outbytes, outsize) < 0)
7684 goto error;
7685 ret = 0;
7686
7687error:
7688 Py_XDECREF(encoding_obj);
7689 Py_XDECREF(errorHandler);
7690 Py_XDECREF(exc);
7691 return ret;
7692}
7693
Victor Stinner3a50e702011-10-18 21:21:00 +02007694static PyObject *
7695encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007696 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007697 const char *errors)
7698{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007699 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007700 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007701 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007702 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007703
Victor Stinner29dacf22015-01-26 16:41:32 +01007704 if (!PyUnicode_Check(unicode)) {
7705 PyErr_BadArgument();
7706 return NULL;
7707 }
7708
Benjamin Petersonbac79492012-01-14 13:34:47 -05007709 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007710 return NULL;
7711 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007712
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 if (code_page < 0) {
7714 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7715 return NULL;
7716 }
7717
Martin v. Löwis3d325192011-11-04 18:23:06 +01007718 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007719 return PyBytes_FromStringAndSize(NULL, 0);
7720
Victor Stinner7581cef2011-11-03 22:32:33 +01007721 offset = 0;
7722 do
7723 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007724#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007725 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007726 chunks. */
7727 if (len > INT_MAX/2) {
7728 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007729 done = 0;
7730 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007731 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007732#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007734 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 done = 1;
7736 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007737
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007739 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 errors);
7741 if (ret == -2)
7742 ret = encode_code_page_errors(code_page, &outbytes,
7743 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007745 if (ret < 0) {
7746 Py_XDECREF(outbytes);
7747 return NULL;
7748 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007749
Victor Stinner7581cef2011-11-03 22:32:33 +01007750 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007752 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007753
Victor Stinner3a50e702011-10-18 21:21:00 +02007754 return outbytes;
7755}
7756
7757PyObject *
7758PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7759 Py_ssize_t size,
7760 const char *errors)
7761{
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007763 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 if (unicode == NULL)
7765 return NULL;
7766 res = encode_code_page(CP_ACP, unicode, errors);
7767 Py_DECREF(unicode);
7768 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007769}
7770
7771PyObject *
7772PyUnicode_EncodeCodePage(int code_page,
7773 PyObject *unicode,
7774 const char *errors)
7775{
Victor Stinner7581cef2011-11-03 22:32:33 +01007776 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007777}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007778
Alexander Belopolsky40018472011-02-26 01:02:56 +00007779PyObject *
7780PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007781{
Victor Stinner7581cef2011-11-03 22:32:33 +01007782 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007783}
7784
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007785#undef NEED_RETRY
7786
Steve Dowercc16be82016-09-08 10:35:16 -07007787#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007788
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789/* --- Character Mapping Codec -------------------------------------------- */
7790
Victor Stinnerfb161b12013-04-18 01:44:27 +02007791static int
7792charmap_decode_string(const char *s,
7793 Py_ssize_t size,
7794 PyObject *mapping,
7795 const char *errors,
7796 _PyUnicodeWriter *writer)
7797{
7798 const char *starts = s;
7799 const char *e;
7800 Py_ssize_t startinpos, endinpos;
7801 PyObject *errorHandler = NULL, *exc = NULL;
7802 Py_ssize_t maplen;
7803 enum PyUnicode_Kind mapkind;
7804 void *mapdata;
7805 Py_UCS4 x;
7806 unsigned char ch;
7807
7808 if (PyUnicode_READY(mapping) == -1)
7809 return -1;
7810
7811 maplen = PyUnicode_GET_LENGTH(mapping);
7812 mapdata = PyUnicode_DATA(mapping);
7813 mapkind = PyUnicode_KIND(mapping);
7814
7815 e = s + size;
7816
7817 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7818 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7819 * is disabled in encoding aliases, latin1 is preferred because
7820 * its implementation is faster. */
7821 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7822 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7823 Py_UCS4 maxchar = writer->maxchar;
7824
7825 assert (writer->kind == PyUnicode_1BYTE_KIND);
7826 while (s < e) {
7827 ch = *s;
7828 x = mapdata_ucs1[ch];
7829 if (x > maxchar) {
7830 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7831 goto onError;
7832 maxchar = writer->maxchar;
7833 outdata = (Py_UCS1 *)writer->data;
7834 }
7835 outdata[writer->pos] = x;
7836 writer->pos++;
7837 ++s;
7838 }
7839 return 0;
7840 }
7841
7842 while (s < e) {
7843 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7844 enum PyUnicode_Kind outkind = writer->kind;
7845 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7846 if (outkind == PyUnicode_1BYTE_KIND) {
7847 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7848 Py_UCS4 maxchar = writer->maxchar;
7849 while (s < e) {
7850 ch = *s;
7851 x = mapdata_ucs2[ch];
7852 if (x > maxchar)
7853 goto Error;
7854 outdata[writer->pos] = x;
7855 writer->pos++;
7856 ++s;
7857 }
7858 break;
7859 }
7860 else if (outkind == PyUnicode_2BYTE_KIND) {
7861 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7862 while (s < e) {
7863 ch = *s;
7864 x = mapdata_ucs2[ch];
7865 if (x == 0xFFFE)
7866 goto Error;
7867 outdata[writer->pos] = x;
7868 writer->pos++;
7869 ++s;
7870 }
7871 break;
7872 }
7873 }
7874 ch = *s;
7875
7876 if (ch < maplen)
7877 x = PyUnicode_READ(mapkind, mapdata, ch);
7878 else
7879 x = 0xfffe; /* invalid value */
7880Error:
7881 if (x == 0xfffe)
7882 {
7883 /* undefined mapping */
7884 startinpos = s-starts;
7885 endinpos = startinpos+1;
7886 if (unicode_decode_call_errorhandler_writer(
7887 errors, &errorHandler,
7888 "charmap", "character maps to <undefined>",
7889 &starts, &e, &startinpos, &endinpos, &exc, &s,
7890 writer)) {
7891 goto onError;
7892 }
7893 continue;
7894 }
7895
7896 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7897 goto onError;
7898 ++s;
7899 }
7900 Py_XDECREF(errorHandler);
7901 Py_XDECREF(exc);
7902 return 0;
7903
7904onError:
7905 Py_XDECREF(errorHandler);
7906 Py_XDECREF(exc);
7907 return -1;
7908}
7909
7910static int
7911charmap_decode_mapping(const char *s,
7912 Py_ssize_t size,
7913 PyObject *mapping,
7914 const char *errors,
7915 _PyUnicodeWriter *writer)
7916{
7917 const char *starts = s;
7918 const char *e;
7919 Py_ssize_t startinpos, endinpos;
7920 PyObject *errorHandler = NULL, *exc = NULL;
7921 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007922 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007923
7924 e = s + size;
7925
7926 while (s < e) {
7927 ch = *s;
7928
7929 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7930 key = PyLong_FromLong((long)ch);
7931 if (key == NULL)
7932 goto onError;
7933
7934 item = PyObject_GetItem(mapping, key);
7935 Py_DECREF(key);
7936 if (item == NULL) {
7937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7938 /* No mapping found means: mapping is undefined. */
7939 PyErr_Clear();
7940 goto Undefined;
7941 } else
7942 goto onError;
7943 }
7944
7945 /* Apply mapping */
7946 if (item == Py_None)
7947 goto Undefined;
7948 if (PyLong_Check(item)) {
7949 long value = PyLong_AS_LONG(item);
7950 if (value == 0xFFFE)
7951 goto Undefined;
7952 if (value < 0 || value > MAX_UNICODE) {
7953 PyErr_Format(PyExc_TypeError,
7954 "character mapping must be in range(0x%lx)",
7955 (unsigned long)MAX_UNICODE + 1);
7956 goto onError;
7957 }
7958
7959 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7960 goto onError;
7961 }
7962 else if (PyUnicode_Check(item)) {
7963 if (PyUnicode_READY(item) == -1)
7964 goto onError;
7965 if (PyUnicode_GET_LENGTH(item) == 1) {
7966 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7967 if (value == 0xFFFE)
7968 goto Undefined;
7969 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7970 goto onError;
7971 }
7972 else {
7973 writer->overallocate = 1;
7974 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7975 goto onError;
7976 }
7977 }
7978 else {
7979 /* wrong return value */
7980 PyErr_SetString(PyExc_TypeError,
7981 "character mapping must return integer, None or str");
7982 goto onError;
7983 }
7984 Py_CLEAR(item);
7985 ++s;
7986 continue;
7987
7988Undefined:
7989 /* undefined mapping */
7990 Py_CLEAR(item);
7991 startinpos = s-starts;
7992 endinpos = startinpos+1;
7993 if (unicode_decode_call_errorhandler_writer(
7994 errors, &errorHandler,
7995 "charmap", "character maps to <undefined>",
7996 &starts, &e, &startinpos, &endinpos, &exc, &s,
7997 writer)) {
7998 goto onError;
7999 }
8000 }
8001 Py_XDECREF(errorHandler);
8002 Py_XDECREF(exc);
8003 return 0;
8004
8005onError:
8006 Py_XDECREF(item);
8007 Py_XDECREF(errorHandler);
8008 Py_XDECREF(exc);
8009 return -1;
8010}
8011
Alexander Belopolsky40018472011-02-26 01:02:56 +00008012PyObject *
8013PyUnicode_DecodeCharmap(const char *s,
8014 Py_ssize_t size,
8015 PyObject *mapping,
8016 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008018 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008019
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 /* Default to Latin-1 */
8021 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008025 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008026 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008027 writer.min_length = size;
8028 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008030
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008031 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008032 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8033 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008034 }
8035 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008036 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8037 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008039 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008040
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008042 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 return NULL;
8044}
8045
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046/* Charmap encoding: the lookup table */
8047
Alexander Belopolsky40018472011-02-26 01:02:56 +00008048struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 PyObject_HEAD
8050 unsigned char level1[32];
8051 int count2, count3;
8052 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053};
8054
8055static PyObject*
8056encoding_map_size(PyObject *obj, PyObject* args)
8057{
8058 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008059 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061}
8062
8063static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008064 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 PyDoc_STR("Return the size (in bytes) of this object") },
8066 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067};
8068
8069static void
8070encoding_map_dealloc(PyObject* o)
8071{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008072 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008073}
8074
8075static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008076 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 "EncodingMap", /*tp_name*/
8078 sizeof(struct encoding_map), /*tp_basicsize*/
8079 0, /*tp_itemsize*/
8080 /* methods */
8081 encoding_map_dealloc, /*tp_dealloc*/
8082 0, /*tp_print*/
8083 0, /*tp_getattr*/
8084 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008085 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 0, /*tp_repr*/
8087 0, /*tp_as_number*/
8088 0, /*tp_as_sequence*/
8089 0, /*tp_as_mapping*/
8090 0, /*tp_hash*/
8091 0, /*tp_call*/
8092 0, /*tp_str*/
8093 0, /*tp_getattro*/
8094 0, /*tp_setattro*/
8095 0, /*tp_as_buffer*/
8096 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8097 0, /*tp_doc*/
8098 0, /*tp_traverse*/
8099 0, /*tp_clear*/
8100 0, /*tp_richcompare*/
8101 0, /*tp_weaklistoffset*/
8102 0, /*tp_iter*/
8103 0, /*tp_iternext*/
8104 encoding_map_methods, /*tp_methods*/
8105 0, /*tp_members*/
8106 0, /*tp_getset*/
8107 0, /*tp_base*/
8108 0, /*tp_dict*/
8109 0, /*tp_descr_get*/
8110 0, /*tp_descr_set*/
8111 0, /*tp_dictoffset*/
8112 0, /*tp_init*/
8113 0, /*tp_alloc*/
8114 0, /*tp_new*/
8115 0, /*tp_free*/
8116 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117};
8118
8119PyObject*
8120PyUnicode_BuildEncodingMap(PyObject* string)
8121{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 PyObject *result;
8123 struct encoding_map *mresult;
8124 int i;
8125 int need_dict = 0;
8126 unsigned char level1[32];
8127 unsigned char level2[512];
8128 unsigned char *mlevel1, *mlevel2, *mlevel3;
8129 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 int kind;
8131 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008132 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008135 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136 PyErr_BadArgument();
8137 return NULL;
8138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008139 kind = PyUnicode_KIND(string);
8140 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008141 length = PyUnicode_GET_LENGTH(string);
8142 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 memset(level1, 0xFF, sizeof level1);
8144 memset(level2, 0xFF, sizeof level2);
8145
8146 /* If there isn't a one-to-one mapping of NULL to \0,
8147 or if there are non-BMP characters, we need to use
8148 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008151 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 ch = PyUnicode_READ(kind, data, i);
8154 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 need_dict = 1;
8156 break;
8157 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 /* unmapped character */
8160 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 l1 = ch >> 11;
8162 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 if (level1[l1] == 0xFF)
8164 level1[l1] = count2++;
8165 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008166 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 }
8168
8169 if (count2 >= 0xFF || count3 >= 0xFF)
8170 need_dict = 1;
8171
8172 if (need_dict) {
8173 PyObject *result = PyDict_New();
8174 PyObject *key, *value;
8175 if (!result)
8176 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008177 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008179 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180 if (!key || !value)
8181 goto failed1;
8182 if (PyDict_SetItem(result, key, value) == -1)
8183 goto failed1;
8184 Py_DECREF(key);
8185 Py_DECREF(value);
8186 }
8187 return result;
8188 failed1:
8189 Py_XDECREF(key);
8190 Py_XDECREF(value);
8191 Py_DECREF(result);
8192 return NULL;
8193 }
8194
8195 /* Create a three-level trie */
8196 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8197 16*count2 + 128*count3 - 1);
8198 if (!result)
8199 return PyErr_NoMemory();
8200 PyObject_Init(result, &EncodingMapType);
8201 mresult = (struct encoding_map*)result;
8202 mresult->count2 = count2;
8203 mresult->count3 = count3;
8204 mlevel1 = mresult->level1;
8205 mlevel2 = mresult->level23;
8206 mlevel3 = mresult->level23 + 16*count2;
8207 memcpy(mlevel1, level1, 32);
8208 memset(mlevel2, 0xFF, 16*count2);
8209 memset(mlevel3, 0, 128*count3);
8210 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008211 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008213 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8214 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 /* unmapped character */
8216 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008217 o1 = ch>>11;
8218 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008219 i2 = 16*mlevel1[o1] + o2;
8220 if (mlevel2[i2] == 0xFF)
8221 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008222 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223 i3 = 128*mlevel2[i2] + o3;
8224 mlevel3[i3] = i;
8225 }
8226 return result;
8227}
8228
8229static int
Victor Stinner22168992011-11-20 17:09:18 +01008230encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008231{
8232 struct encoding_map *map = (struct encoding_map*)mapping;
8233 int l1 = c>>11;
8234 int l2 = (c>>7) & 0xF;
8235 int l3 = c & 0x7F;
8236 int i;
8237
Victor Stinner22168992011-11-20 17:09:18 +01008238 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008240 if (c == 0)
8241 return 0;
8242 /* level 1*/
8243 i = map->level1[l1];
8244 if (i == 0xFF) {
8245 return -1;
8246 }
8247 /* level 2*/
8248 i = map->level23[16*i+l2];
8249 if (i == 0xFF) {
8250 return -1;
8251 }
8252 /* level 3 */
8253 i = map->level23[16*map->count2 + 128*i + l3];
8254 if (i == 0) {
8255 return -1;
8256 }
8257 return i;
8258}
8259
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260/* Lookup the character ch in the mapping. If the character
8261 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008262 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008263static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008264charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265{
Christian Heimes217cfd12007-12-02 14:31:20 +00008266 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 PyObject *x;
8268
8269 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 x = PyObject_GetItem(mapping, w);
8272 Py_DECREF(w);
8273 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8275 /* No mapping found means: mapping is undefined. */
8276 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008277 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 } else
8279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008281 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008283 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 long value = PyLong_AS_LONG(x);
8285 if (value < 0 || value > 255) {
8286 PyErr_SetString(PyExc_TypeError,
8287 "character mapping must be in range(256)");
8288 Py_DECREF(x);
8289 return NULL;
8290 }
8291 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008293 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 /* wrong return value */
8297 PyErr_Format(PyExc_TypeError,
8298 "character mapping must return integer, bytes or None, not %.400s",
8299 x->ob_type->tp_name);
8300 Py_DECREF(x);
8301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 }
8303}
8304
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008306charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008308 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8309 /* exponentially overallocate to minimize reallocations */
8310 if (requiredsize < 2*outsize)
8311 requiredsize = 2*outsize;
8312 if (_PyBytes_Resize(outobj, requiredsize))
8313 return -1;
8314 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315}
8316
/* Outcome of an attempt to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were stored in the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008321 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 space is available. Return a new reference to the object that
8323 was put in the output buffer, or Py_None, if the mapping was undefined
8324 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008325 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008327charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 PyObject *rep;
8331 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008332 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333
Christian Heimes90aa7642007-12-19 02:45:37 +00008334 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 if (res == -1)
8338 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 if (outsize<requiredsize)
8340 if (charmapencode_resize(outobj, outpos, requiredsize))
8341 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008342 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 outstart[(*outpos)++] = (char)res;
8344 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 }
8346
8347 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 Py_DECREF(rep);
8352 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 if (PyLong_Check(rep)) {
8355 Py_ssize_t requiredsize = *outpos+1;
8356 if (outsize<requiredsize)
8357 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8358 Py_DECREF(rep);
8359 return enc_EXCEPTION;
8360 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008361 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 else {
8365 const char *repchars = PyBytes_AS_STRING(rep);
8366 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8367 Py_ssize_t requiredsize = *outpos+repsize;
8368 if (outsize<requiredsize)
8369 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8370 Py_DECREF(rep);
8371 return enc_EXCEPTION;
8372 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008373 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 memcpy(outstart + *outpos, repchars, repsize);
8375 *outpos += repsize;
8376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378 Py_DECREF(rep);
8379 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380}
8381
8382/* handle an error in PyUnicode_EncodeCharmap
8383 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008384static int
8385charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008388 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008389 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390{
8391 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008392 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008393 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008394 enum PyUnicode_Kind kind;
8395 void *data;
8396 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 Py_ssize_t collstartpos = *inpos;
8399 Py_ssize_t collendpos = *inpos+1;
8400 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008401 const char *encoding = "charmap";
8402 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008403 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008405 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406
Benjamin Petersonbac79492012-01-14 13:34:47 -05008407 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008408 return -1;
8409 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 /* find all unencodable characters */
8411 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008413 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008414 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008415 val = encoding_map_lookup(ch, mapping);
8416 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 break;
8418 ++collendpos;
8419 continue;
8420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008422 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8423 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 if (rep==NULL)
8425 return -1;
8426 else if (rep!=Py_None) {
8427 Py_DECREF(rep);
8428 break;
8429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008430 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
8433 /* cache callback name lookup
8434 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008435 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008436 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008437
8438 switch (*error_handler) {
8439 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008440 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008442
8443 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 x = charmapencode_output('?', mapping, res, respos);
8446 if (x==enc_EXCEPTION) {
8447 return -1;
8448 }
8449 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008450 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return -1;
8452 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 }
8454 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008455 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008456 *inpos = collendpos;
8457 break;
Victor Stinner50149202015-09-22 00:26:54 +02008458
8459 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 /* generate replacement (temporarily (mis)uses p) */
8461 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 char buffer[2+29+1+1];
8463 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008464 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 for (cp = buffer; *cp; ++cp) {
8466 x = charmapencode_output(*cp, mapping, res, respos);
8467 if (x==enc_EXCEPTION)
8468 return -1;
8469 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008470 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 return -1;
8472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 }
8474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 *inpos = collendpos;
8476 break;
Victor Stinner50149202015-09-22 00:26:54 +02008477
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 default:
Victor Stinner50149202015-09-22 00:26:54 +02008479 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008480 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008484 if (PyBytes_Check(repunicode)) {
8485 /* Directly copy bytes result to output. */
8486 Py_ssize_t outsize = PyBytes_Size(*res);
8487 Py_ssize_t requiredsize;
8488 repsize = PyBytes_Size(repunicode);
8489 requiredsize = *respos + repsize;
8490 if (requiredsize > outsize)
8491 /* Make room for all additional bytes. */
8492 if (charmapencode_resize(res, respos, requiredsize)) {
8493 Py_DECREF(repunicode);
8494 return -1;
8495 }
8496 memcpy(PyBytes_AsString(*res) + *respos,
8497 PyBytes_AsString(repunicode), repsize);
8498 *respos += repsize;
8499 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008500 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008501 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008503 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008504 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008505 Py_DECREF(repunicode);
8506 return -1;
8507 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008508 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008509 data = PyUnicode_DATA(repunicode);
8510 kind = PyUnicode_KIND(repunicode);
8511 for (index = 0; index < repsize; index++) {
8512 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8513 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008515 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
8517 }
8518 else if (x==enc_FAILED) {
8519 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008520 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 return -1;
8522 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008523 }
8524 *inpos = newpos;
8525 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 }
8527 return 0;
8528}
8529
Alexander Belopolsky40018472011-02-26 01:02:56 +00008530PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008531_PyUnicode_EncodeCharmap(PyObject *unicode,
8532 PyObject *mapping,
8533 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 /* output object */
8536 PyObject *res = NULL;
8537 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008539 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008541 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008542 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008544 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008545 void *data;
8546 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547
Benjamin Petersonbac79492012-01-14 13:34:47 -05008548 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008549 return NULL;
8550 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008551 data = PyUnicode_DATA(unicode);
8552 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008553
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 /* Default to Latin-1 */
8555 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 /* allocate enough for a simple encoding without
8559 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008560 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 if (res == NULL)
8562 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008563 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008567 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008569 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 if (x==enc_EXCEPTION) /* error */
8571 goto onError;
8572 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008573 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008575 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 &res, &respos)) {
8577 goto onError;
8578 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008579 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 else
8581 /* done with this character => adjust input position */
8582 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008586 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008587 if (_PyBytes_Resize(&res, respos) < 0)
8588 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008591 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 return res;
8593
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 Py_XDECREF(res);
8596 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008597 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 return NULL;
8599}
8600
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601/* Deprecated */
8602PyObject *
8603PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8604 Py_ssize_t size,
8605 PyObject *mapping,
8606 const char *errors)
8607{
8608 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008609 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008610 if (unicode == NULL)
8611 return NULL;
8612 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8613 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008614 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008615}
8616
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617PyObject *
8618PyUnicode_AsCharmapString(PyObject *unicode,
8619 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620{
8621 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 PyErr_BadArgument();
8623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008625 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626}
8627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629static void
8630make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008632 Py_ssize_t startpos, Py_ssize_t endpos,
8633 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 *exceptionObject = _PyUnicodeTranslateError_Create(
8637 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 }
8639 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8641 goto onError;
8642 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8643 goto onError;
8644 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8645 goto onError;
8646 return;
8647 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008648 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 }
8650}
8651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652/* error handling callback helper:
8653 build arguments, call the callback and check the arguments,
8654 put the result into newpos and return the replacement string, which
8655 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008656static PyObject *
8657unicode_translate_call_errorhandler(const char *errors,
8658 PyObject **errorHandler,
8659 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008661 Py_ssize_t startpos, Py_ssize_t endpos,
8662 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008664 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008666 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 PyObject *restuple;
8668 PyObject *resunicode;
8669
8670 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 }
8675
8676 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008681 restuple = PyObject_CallFunctionObjArgs(
8682 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008686 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 Py_DECREF(restuple);
8688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008690 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 &resunicode, &i_newpos)) {
8692 Py_DECREF(restuple);
8693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 else
8698 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008700 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 Py_DECREF(restuple);
8702 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704 Py_INCREF(resunicode);
8705 Py_DECREF(restuple);
8706 return resunicode;
8707}
8708
8709/* Lookup the character ch in the mapping and put the result in result,
8710 which must be decrefed by the caller.
8711 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008712static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714{
Christian Heimes217cfd12007-12-02 14:31:20 +00008715 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 PyObject *x;
8717
8718 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720 x = PyObject_GetItem(mapping, w);
8721 Py_DECREF(w);
8722 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8724 /* No mapping found means: use 1:1 mapping. */
8725 PyErr_Clear();
8726 *result = NULL;
8727 return 0;
8728 } else
8729 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 }
8731 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 *result = x;
8733 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008734 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008735 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008737 if (value < 0 || value > MAX_UNICODE) {
8738 PyErr_Format(PyExc_ValueError,
8739 "character mapping must be in range(0x%x)",
8740 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 Py_DECREF(x);
8742 return -1;
8743 }
8744 *result = x;
8745 return 0;
8746 }
8747 else if (PyUnicode_Check(x)) {
8748 *result = x;
8749 return 0;
8750 }
8751 else {
8752 /* wrong return value */
8753 PyErr_SetString(PyExc_TypeError,
8754 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008755 Py_DECREF(x);
8756 return -1;
8757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758}
Victor Stinner1194ea02014-04-04 19:37:40 +02008759
8760/* lookup the character, write the result into the writer.
8761 Return 1 if the result was written into the writer, return 0 if the mapping
8762 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008764charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8765 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766{
Victor Stinner1194ea02014-04-04 19:37:40 +02008767 PyObject *item;
8768
8769 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008771
8772 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008777 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008778 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008779
8780 if (item == Py_None) {
8781 Py_DECREF(item);
8782 return 0;
8783 }
8784
8785 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008786 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8787 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8788 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008789 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8790 Py_DECREF(item);
8791 return -1;
8792 }
8793 Py_DECREF(item);
8794 return 1;
8795 }
8796
8797 if (!PyUnicode_Check(item)) {
8798 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008800 }
8801
8802 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8803 Py_DECREF(item);
8804 return -1;
8805 }
8806
8807 Py_DECREF(item);
8808 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008809}
8810
Victor Stinner89a76ab2014-04-05 11:44:04 +02008811static int
8812unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8813 Py_UCS1 *translate)
8814{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008815 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 int ret = 0;
8817
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 if (charmaptranslate_lookup(ch, mapping, &item)) {
8819 return -1;
8820 }
8821
8822 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008823 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008824 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008826 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 /* not found => default to 1:1 mapping */
8828 translate[ch] = ch;
8829 return 1;
8830 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008831 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008832 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008833 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8834 used it */
8835 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008836 /* invalid character or character outside ASCII:
8837 skip the fast translate */
8838 goto exit;
8839 }
8840 translate[ch] = (Py_UCS1)replace;
8841 }
8842 else if (PyUnicode_Check(item)) {
8843 Py_UCS4 replace;
8844
8845 if (PyUnicode_READY(item) == -1) {
8846 Py_DECREF(item);
8847 return -1;
8848 }
8849 if (PyUnicode_GET_LENGTH(item) != 1)
8850 goto exit;
8851
8852 replace = PyUnicode_READ_CHAR(item, 0);
8853 if (replace > 127)
8854 goto exit;
8855 translate[ch] = (Py_UCS1)replace;
8856 }
8857 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008858 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 goto exit;
8860 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 ret = 1;
8862
Benjamin Peterson1365de72014-04-07 20:15:41 -04008863 exit:
8864 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 return ret;
8866}
8867
8868/* Fast path for ascii => ascii translation. Return 1 if the whole string
8869 was translated into writer, return 0 if the input string was partially
8870 translated into writer, raise an exception and return -1 on error. */
8871static int
8872unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008873 _PyUnicodeWriter *writer, int ignore,
8874 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875{
Victor Stinner872b2912014-04-05 14:27:07 +02008876 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877 Py_ssize_t len;
8878 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008879 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880
Victor Stinner89a76ab2014-04-05 11:44:04 +02008881 len = PyUnicode_GET_LENGTH(input);
8882
Victor Stinner872b2912014-04-05 14:27:07 +02008883 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884
8885 in = PyUnicode_1BYTE_DATA(input);
8886 end = in + len;
8887
8888 assert(PyUnicode_IS_ASCII(writer->buffer));
8889 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8890 out = PyUnicode_1BYTE_DATA(writer->buffer);
8891
Victor Stinner872b2912014-04-05 14:27:07 +02008892 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008894 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008896 int translate = unicode_fast_translate_lookup(mapping, ch,
8897 ascii_table);
8898 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008900 if (translate == 0)
8901 goto exit;
8902 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008903 }
Victor Stinner872b2912014-04-05 14:27:07 +02008904 if (ch2 == 0xfe) {
8905 if (ignore)
8906 continue;
8907 goto exit;
8908 }
8909 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008911 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912 }
Victor Stinner872b2912014-04-05 14:27:07 +02008913 res = 1;
8914
8915exit:
8916 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008917 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008918 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919}
8920
Victor Stinner3222da22015-10-01 22:07:32 +02008921static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922_PyUnicode_TranslateCharmap(PyObject *input,
8923 PyObject *mapping,
8924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008927 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 Py_ssize_t size, i;
8929 int kind;
8930 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 _PyUnicodeWriter writer;
8932 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008933 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008934 PyObject *errorHandler = NULL;
8935 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008936 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008937 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 PyErr_BadArgument();
8941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 if (PyUnicode_READY(input) == -1)
8945 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008946 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 kind = PyUnicode_KIND(input);
8948 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008950 if (size == 0)
8951 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008953 /* allocate enough for a simple 1:1 translation without
8954 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008955 _PyUnicodeWriter_Init(&writer);
8956 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958
Victor Stinner872b2912014-04-05 14:27:07 +02008959 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8960
Victor Stinner33798672016-03-01 21:59:58 +01008961 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008962 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008963 if (PyUnicode_IS_ASCII(input)) {
8964 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8965 if (res < 0) {
8966 _PyUnicodeWriter_Dealloc(&writer);
8967 return NULL;
8968 }
8969 if (res == 1)
8970 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008971 }
Victor Stinner33798672016-03-01 21:59:58 +01008972 else {
8973 i = 0;
8974 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008978 int translate;
8979 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8980 Py_ssize_t newpos;
8981 /* startpos for collecting untranslatable chars */
8982 Py_ssize_t collstart;
8983 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 ch = PyUnicode_READ(kind, data, i);
8987 translate = charmaptranslate_output(ch, mapping, &writer);
8988 if (translate < 0)
8989 goto onError;
8990
8991 if (translate != 0) {
8992 /* it worked => adjust input pointer */
8993 ++i;
8994 continue;
8995 }
8996
8997 /* untranslatable character */
8998 collstart = i;
8999 collend = i+1;
9000
9001 /* find all untranslatable characters */
9002 while (collend < size) {
9003 PyObject *x;
9004 ch = PyUnicode_READ(kind, data, collend);
9005 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009006 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009007 Py_XDECREF(x);
9008 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009010 ++collend;
9011 }
9012
9013 if (ignore) {
9014 i = collend;
9015 }
9016 else {
9017 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9018 reason, input, &exc,
9019 collstart, collend, &newpos);
9020 if (repunicode == NULL)
9021 goto onError;
9022 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009025 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 Py_DECREF(repunicode);
9027 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009028 }
9029 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030 Py_XDECREF(exc);
9031 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009032 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009036 Py_XDECREF(exc);
9037 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 return NULL;
9039}
9040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041/* Deprecated. Use PyUnicode_Translate instead. */
9042PyObject *
9043PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9044 Py_ssize_t size,
9045 PyObject *mapping,
9046 const char *errors)
9047{
Christian Heimes5f520f42012-09-11 14:03:25 +02009048 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009049 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 if (!unicode)
9051 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009052 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9053 Py_DECREF(unicode);
9054 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055}
9056
Alexander Belopolsky40018472011-02-26 01:02:56 +00009057PyObject *
9058PyUnicode_Translate(PyObject *str,
9059 PyObject *mapping,
9060 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009062 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009063 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009064 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065}
Tim Petersced69f82003-09-16 20:30:58 +00009066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067PyObject *
9068_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9069{
9070 if (!PyUnicode_Check(unicode)) {
9071 PyErr_BadInternalCall();
9072 return NULL;
9073 }
9074 if (PyUnicode_READY(unicode) == -1)
9075 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009076 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 /* If the string is already ASCII, just return the same string */
9078 Py_INCREF(unicode);
9079 return unicode;
9080 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009081
9082 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9083 PyObject *result = PyUnicode_New(len, 127);
9084 if (result == NULL) {
9085 return NULL;
9086 }
9087
9088 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9089 int kind = PyUnicode_KIND(unicode);
9090 const void *data = PyUnicode_DATA(unicode);
9091 Py_ssize_t i;
9092 for (i = 0; i < len; ++i) {
9093 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9094 if (ch < 127) {
9095 out[i] = ch;
9096 }
9097 else if (Py_UNICODE_ISSPACE(ch)) {
9098 out[i] = ' ';
9099 }
9100 else {
9101 int decimal = Py_UNICODE_TODECIMAL(ch);
9102 if (decimal < 0) {
9103 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009104 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009105 _PyUnicode_LENGTH(result) = i + 1;
9106 break;
9107 }
9108 out[i] = '0' + decimal;
9109 }
9110 }
9111
INADA Naoki16dfca42018-07-14 12:06:43 +09009112 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009113 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114}
9115
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009116PyObject *
9117PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9118 Py_ssize_t length)
9119{
Victor Stinnerf0124502011-11-21 23:12:56 +01009120 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009121 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009122 Py_UCS4 maxchar;
9123 enum PyUnicode_Kind kind;
9124 void *data;
9125
Victor Stinner99d7ad02012-02-22 13:37:39 +01009126 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009127 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009128 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009129 if (ch > 127) {
9130 int decimal = Py_UNICODE_TODECIMAL(ch);
9131 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009132 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009133 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009134 }
9135 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009136
9137 /* Copy to a new string */
9138 decimal = PyUnicode_New(length, maxchar);
9139 if (decimal == NULL)
9140 return decimal;
9141 kind = PyUnicode_KIND(decimal);
9142 data = PyUnicode_DATA(decimal);
9143 /* Iterate over code points */
9144 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009145 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009146 if (ch > 127) {
9147 int decimal = Py_UNICODE_TODECIMAL(ch);
9148 if (decimal >= 0)
9149 ch = '0' + decimal;
9150 }
9151 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009153 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009154}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009155/* --- Decimal Encoder ---------------------------------------------------- */
9156
Alexander Belopolsky40018472011-02-26 01:02:56 +00009157int
9158PyUnicode_EncodeDecimal(Py_UNICODE *s,
9159 Py_ssize_t length,
9160 char *output,
9161 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009162{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009163 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009164 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009165 enum PyUnicode_Kind kind;
9166 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009167
9168 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 PyErr_BadArgument();
9170 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009171 }
9172
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009173 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009174 if (unicode == NULL)
9175 return -1;
9176
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 kind = PyUnicode_KIND(unicode);
9178 data = PyUnicode_DATA(unicode);
9179
Victor Stinnerb84d7232011-11-22 01:50:07 +01009180 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009181 PyObject *exc;
9182 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009184 Py_ssize_t startpos;
9185
9186 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009187
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009189 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009190 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 decimal = Py_UNICODE_TODECIMAL(ch);
9194 if (decimal >= 0) {
9195 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009196 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 continue;
9198 }
9199 if (0 < ch && ch < 256) {
9200 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009201 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 continue;
9203 }
Victor Stinner6345be92011-11-25 20:09:01 +01009204
Victor Stinner42bf7752011-11-21 22:52:58 +01009205 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009206 exc = NULL;
9207 raise_encode_exception(&exc, "decimal", unicode,
9208 startpos, startpos+1,
9209 "invalid decimal Unicode string");
9210 Py_XDECREF(exc);
9211 Py_DECREF(unicode);
9212 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009213 }
9214 /* 0-terminate the output string */
9215 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009216 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009217 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009218}
9219
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220/* --- Helpers ------------------------------------------------------------ */
9221
/* Helper macro to fix up start/end slice values.

   `end` is clamped to `len`; a negative `end` or `start` is taken
   relative to `len` and, if still negative, set to 0.  `start` is not
   clamped against `len`.

   `start` and `end` must be modifiable lvalues.  `len` may be evaluated
   several times, so it must be free of side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely in an unbraced if/else; write
   the invocation with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009238any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240 Py_ssize_t end,
9241 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009243 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 void *buf1, *buf2;
9245 Py_ssize_t len1, len2, result;
9246
9247 kind1 = PyUnicode_KIND(s1);
9248 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009249 if (kind1 < kind2)
9250 return -1;
9251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 len1 = PyUnicode_GET_LENGTH(s1);
9253 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009254 ADJUST_INDICES(start, end, len1);
9255 if (end - start < len2)
9256 return -1;
9257
9258 buf1 = PyUnicode_DATA(s1);
9259 buf2 = PyUnicode_DATA(s2);
9260 if (len2 == 1) {
9261 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9262 result = findchar((const char *)buf1 + kind1*start,
9263 kind1, end - start, ch, direction);
9264 if (result == -1)
9265 return -1;
9266 else
9267 return start + result;
9268 }
9269
9270 if (kind2 != kind1) {
9271 buf2 = _PyUnicode_AsKind(s2, kind1);
9272 if (!buf2)
9273 return -2;
9274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275
Victor Stinner794d5672011-10-10 03:21:36 +02009276 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009277 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009278 case PyUnicode_1BYTE_KIND:
9279 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9280 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9281 else
9282 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9283 break;
9284 case PyUnicode_2BYTE_KIND:
9285 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9286 break;
9287 case PyUnicode_4BYTE_KIND:
9288 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9289 break;
9290 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009291 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009292 }
9293 }
9294 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009295 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009296 case PyUnicode_1BYTE_KIND:
9297 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9298 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9299 else
9300 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 break;
9302 case PyUnicode_2BYTE_KIND:
9303 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 break;
9305 case PyUnicode_4BYTE_KIND:
9306 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 break;
9308 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009309 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 }
9312
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009313 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 PyMem_Free(buf2);
9315
9316 return result;
9317}
9318
9319Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009320_PyUnicode_InsertThousandsGrouping(
9321 PyObject *unicode, Py_ssize_t index,
9322 Py_ssize_t n_buffer,
9323 void *digits, Py_ssize_t n_digits,
9324 Py_ssize_t min_width,
9325 const char *grouping, PyObject *thousands_sep,
9326 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327{
Victor Stinner41a863c2012-02-24 00:37:51 +01009328 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009329 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 Py_ssize_t thousands_sep_len;
9331 Py_ssize_t len;
9332
9333 if (unicode != NULL) {
9334 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009335 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009336 }
9337 else {
9338 kind = PyUnicode_1BYTE_KIND;
9339 data = NULL;
9340 }
9341 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9342 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9343 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9344 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009345 if (thousands_sep_kind < kind) {
9346 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9347 if (!thousands_sep_data)
9348 return -1;
9349 }
9350 else {
9351 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9352 if (!data)
9353 return -1;
9354 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009355 }
9356
Benjamin Petersonead6b532011-12-20 17:23:42 -06009357 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009359 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009360 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009361 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009364 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009368 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009372 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009378 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 break;
9382 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009383 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009385 if (unicode != NULL && thousands_sep_kind != kind) {
9386 if (thousands_sep_kind < kind)
9387 PyMem_Free(thousands_sep_data);
9388 else
9389 PyMem_Free(data);
9390 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009391 if (unicode == NULL) {
9392 *maxchar = 127;
9393 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009394 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009395 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009396 }
9397 }
9398 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399}
9400
9401
Alexander Belopolsky40018472011-02-26 01:02:56 +00009402Py_ssize_t
9403PyUnicode_Count(PyObject *str,
9404 PyObject *substr,
9405 Py_ssize_t start,
9406 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009408 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009409 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 void *buf1 = NULL, *buf2 = NULL;
9411 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009412
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009413 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 kind1 = PyUnicode_KIND(str);
9417 kind2 = PyUnicode_KIND(substr);
9418 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009419 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009420
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009421 len1 = PyUnicode_GET_LENGTH(str);
9422 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009424 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009425 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009427 buf1 = PyUnicode_DATA(str);
9428 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 if (!buf2)
9432 goto onError;
9433 }
9434
9435 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009437 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009438 result = asciilib_count(
9439 ((Py_UCS1*)buf1) + start, end - start,
9440 buf2, len2, PY_SSIZE_T_MAX
9441 );
9442 else
9443 result = ucs1lib_count(
9444 ((Py_UCS1*)buf1) + start, end - start,
9445 buf2, len2, PY_SSIZE_T_MAX
9446 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 break;
9448 case PyUnicode_2BYTE_KIND:
9449 result = ucs2lib_count(
9450 ((Py_UCS2*)buf1) + start, end - start,
9451 buf2, len2, PY_SSIZE_T_MAX
9452 );
9453 break;
9454 case PyUnicode_4BYTE_KIND:
9455 result = ucs4lib_count(
9456 ((Py_UCS4*)buf1) + start, end - start,
9457 buf2, len2, PY_SSIZE_T_MAX
9458 );
9459 break;
9460 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009461 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009463
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009464 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 PyMem_Free(buf2);
9466
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009469 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 PyMem_Free(buf2);
9471 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472}
9473
Alexander Belopolsky40018472011-02-26 01:02:56 +00009474Py_ssize_t
9475PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009476 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477 Py_ssize_t start,
9478 Py_ssize_t end,
9479 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009481 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009483
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485}
9486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487Py_ssize_t
9488PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9489 Py_ssize_t start, Py_ssize_t end,
9490 int direction)
9491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009493 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 if (PyUnicode_READY(str) == -1)
9495 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009496 len = PyUnicode_GET_LENGTH(str);
9497 ADJUST_INDICES(start, end, len);
9498 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009499 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009501 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9502 kind, end-start, ch, direction);
9503 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009505 else
9506 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507}
9508
Alexander Belopolsky40018472011-02-26 01:02:56 +00009509static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009510tailmatch(PyObject *self,
9511 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009512 Py_ssize_t start,
9513 Py_ssize_t end,
9514 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 int kind_self;
9517 int kind_sub;
9518 void *data_self;
9519 void *data_sub;
9520 Py_ssize_t offset;
9521 Py_ssize_t i;
9522 Py_ssize_t end_sub;
9523
9524 if (PyUnicode_READY(self) == -1 ||
9525 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009526 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9529 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009533 if (PyUnicode_GET_LENGTH(substring) == 0)
9534 return 1;
9535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 kind_self = PyUnicode_KIND(self);
9537 data_self = PyUnicode_DATA(self);
9538 kind_sub = PyUnicode_KIND(substring);
9539 data_sub = PyUnicode_DATA(substring);
9540 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9541
9542 if (direction > 0)
9543 offset = end;
9544 else
9545 offset = start;
9546
9547 if (PyUnicode_READ(kind_self, data_self, offset) ==
9548 PyUnicode_READ(kind_sub, data_sub, 0) &&
9549 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9550 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9551 /* If both are of the same kind, memcmp is sufficient */
9552 if (kind_self == kind_sub) {
9553 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009554 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 data_sub,
9556 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009557 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009559 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 else {
9561 /* We do not need to compare 0 and len(substring)-1 because
9562 the if statement above ensured already that they are equal
9563 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 for (i = 1; i < end_sub; ++i) {
9565 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9566 PyUnicode_READ(kind_sub, data_sub, i))
9567 return 0;
9568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 }
9572
9573 return 0;
9574}
9575
Alexander Belopolsky40018472011-02-26 01:02:56 +00009576Py_ssize_t
9577PyUnicode_Tailmatch(PyObject *str,
9578 PyObject *substr,
9579 Py_ssize_t start,
9580 Py_ssize_t end,
9581 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009583 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009585
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009586 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587}
9588
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009589static PyObject *
9590ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009592 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9593 char *resdata, *data = PyUnicode_DATA(self);
9594 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009595
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009596 res = PyUnicode_New(len, 127);
9597 if (res == NULL)
9598 return NULL;
9599 resdata = PyUnicode_DATA(res);
9600 if (lower)
9601 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009603 _Py_bytes_upper(resdata, data, len);
9604 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605}
9606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009608handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009610 Py_ssize_t j;
9611 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009612 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009613 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009614
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009615 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9616
9617 where ! is a negation and \p{xxx} is a character with property xxx.
9618 */
9619 for (j = i - 1; j >= 0; j--) {
9620 c = PyUnicode_READ(kind, data, j);
9621 if (!_PyUnicode_IsCaseIgnorable(c))
9622 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9625 if (final_sigma) {
9626 for (j = i + 1; j < length; j++) {
9627 c = PyUnicode_READ(kind, data, j);
9628 if (!_PyUnicode_IsCaseIgnorable(c))
9629 break;
9630 }
9631 final_sigma = j == length || !_PyUnicode_IsCased(c);
9632 }
9633 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634}
9635
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009636static int
9637lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9638 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009640 /* Obscure special case. */
9641 if (c == 0x3A3) {
9642 mapped[0] = handle_capital_sigma(kind, data, length, i);
9643 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009645 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646}
9647
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009648static Py_ssize_t
9649do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 Py_ssize_t i, k = 0;
9652 int n_res, j;
9653 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 c = PyUnicode_READ(kind, data, 0);
9656 n_res = _PyUnicode_ToUpperFull(c, mapped);
9657 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009658 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 for (i = 1; i < length; i++) {
9662 c = PyUnicode_READ(kind, data, i);
9663 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9664 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009665 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009667 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009668 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670}
9671
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672static Py_ssize_t
9673do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9674 Py_ssize_t i, k = 0;
9675
9676 for (i = 0; i < length; i++) {
9677 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9678 int n_res, j;
9679 if (Py_UNICODE_ISUPPER(c)) {
9680 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9681 }
9682 else if (Py_UNICODE_ISLOWER(c)) {
9683 n_res = _PyUnicode_ToUpperFull(c, mapped);
9684 }
9685 else {
9686 n_res = 1;
9687 mapped[0] = c;
9688 }
9689 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009690 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691 res[k++] = mapped[j];
9692 }
9693 }
9694 return k;
9695}
9696
9697static Py_ssize_t
9698do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9699 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 Py_ssize_t i, k = 0;
9702
9703 for (i = 0; i < length; i++) {
9704 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9705 int n_res, j;
9706 if (lower)
9707 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9708 else
9709 n_res = _PyUnicode_ToUpperFull(c, mapped);
9710 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009711 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712 res[k++] = mapped[j];
9713 }
9714 }
9715 return k;
9716}
9717
9718static Py_ssize_t
9719do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9720{
9721 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9722}
9723
9724static Py_ssize_t
9725do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9726{
9727 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9728}
9729
Benjamin Petersone51757f2012-01-12 21:10:29 -05009730static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009731do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9732{
9733 Py_ssize_t i, k = 0;
9734
9735 for (i = 0; i < length; i++) {
9736 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9737 Py_UCS4 mapped[3];
9738 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9739 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009740 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009741 res[k++] = mapped[j];
9742 }
9743 }
9744 return k;
9745}
9746
9747static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009748do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9749{
9750 Py_ssize_t i, k = 0;
9751 int previous_is_cased;
9752
9753 previous_is_cased = 0;
9754 for (i = 0; i < length; i++) {
9755 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9756 Py_UCS4 mapped[3];
9757 int n_res, j;
9758
9759 if (previous_is_cased)
9760 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9761 else
9762 n_res = _PyUnicode_ToTitleFull(c, mapped);
9763
9764 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009765 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009766 res[k++] = mapped[j];
9767 }
9768
9769 previous_is_cased = _PyUnicode_IsCased(c);
9770 }
9771 return k;
9772}
9773
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009774static PyObject *
9775case_operation(PyObject *self,
9776 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9777{
9778 PyObject *res = NULL;
9779 Py_ssize_t length, newlength = 0;
9780 int kind, outkind;
9781 void *data, *outdata;
9782 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9783
Benjamin Petersoneea48462012-01-16 14:28:50 -05009784 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009785
9786 kind = PyUnicode_KIND(self);
9787 data = PyUnicode_DATA(self);
9788 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009789 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009790 PyErr_SetString(PyExc_OverflowError, "string is too long");
9791 return NULL;
9792 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009793 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009794 if (tmp == NULL)
9795 return PyErr_NoMemory();
9796 newlength = perform(kind, data, length, tmp, &maxchar);
9797 res = PyUnicode_New(newlength, maxchar);
9798 if (res == NULL)
9799 goto leave;
9800 tmpend = tmp + newlength;
9801 outdata = PyUnicode_DATA(res);
9802 outkind = PyUnicode_KIND(res);
9803 switch (outkind) {
9804 case PyUnicode_1BYTE_KIND:
9805 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9806 break;
9807 case PyUnicode_2BYTE_KIND:
9808 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9809 break;
9810 case PyUnicode_4BYTE_KIND:
9811 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9812 break;
9813 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009814 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009815 }
9816 leave:
9817 PyMem_FREE(tmp);
9818 return res;
9819}
9820
Tim Peters8ce9f162004-08-27 01:49:32 +00009821PyObject *
9822PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009824 PyObject *res;
9825 PyObject *fseq;
9826 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009827 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009829 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009830 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009831 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009832 }
9833
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009834 /* NOTE: the following code can't call back into Python code,
9835 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009836 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009837
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009838 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009839 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009840 res = _PyUnicode_JoinArray(separator, items, seqlen);
9841 Py_DECREF(fseq);
9842 return res;
9843}
9844
9845PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009846_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009847{
9848 PyObject *res = NULL; /* the result */
9849 PyObject *sep = NULL;
9850 Py_ssize_t seplen;
9851 PyObject *item;
9852 Py_ssize_t sz, i, res_offset;
9853 Py_UCS4 maxchar;
9854 Py_UCS4 item_maxchar;
9855 int use_memcpy;
9856 unsigned char *res_data = NULL, *sep_data = NULL;
9857 PyObject *last_obj;
9858 unsigned int kind = 0;
9859
Tim Peters05eba1f2004-08-27 21:32:02 +00009860 /* If empty sequence, return u"". */
9861 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009862 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009863 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009864
Tim Peters05eba1f2004-08-27 21:32:02 +00009865 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009866 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009867 if (seqlen == 1) {
9868 if (PyUnicode_CheckExact(items[0])) {
9869 res = items[0];
9870 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009871 return res;
9872 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009873 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009874 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009875 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009876 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009877 /* Set up sep and seplen */
9878 if (separator == NULL) {
9879 /* fall back to a blank space separator */
9880 sep = PyUnicode_FromOrdinal(' ');
9881 if (!sep)
9882 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009883 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009884 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009885 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009886 else {
9887 if (!PyUnicode_Check(separator)) {
9888 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009889 "separator: expected str instance,"
9890 " %.80s found",
9891 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009892 goto onError;
9893 }
9894 if (PyUnicode_READY(separator))
9895 goto onError;
9896 sep = separator;
9897 seplen = PyUnicode_GET_LENGTH(separator);
9898 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9899 /* inc refcount to keep this code path symmetric with the
9900 above case of a blank separator */
9901 Py_INCREF(sep);
9902 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009903 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 }
9905
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009906 /* There are at least two things to join, or else we have a subclass
9907 * of str in the sequence.
9908 * Do a pre-pass to figure out the total amount of space we'll
9909 * need (sz), and see whether all argument are strings.
9910 */
9911 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009912#ifdef Py_DEBUG
9913 use_memcpy = 0;
9914#else
9915 use_memcpy = 1;
9916#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009917 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009918 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009919 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009920 if (!PyUnicode_Check(item)) {
9921 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009922 "sequence item %zd: expected str instance,"
9923 " %.80s found",
9924 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 goto onError;
9926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 if (PyUnicode_READY(item) == -1)
9928 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009929 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009931 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009932 if (i != 0) {
9933 add_sz += seplen;
9934 }
9935 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009936 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938 goto onError;
9939 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009940 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009941 if (use_memcpy && last_obj != NULL) {
9942 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9943 use_memcpy = 0;
9944 }
9945 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009946 }
Tim Petersced69f82003-09-16 20:30:58 +00009947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009949 if (res == NULL)
9950 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009951
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009952 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009953#ifdef Py_DEBUG
9954 use_memcpy = 0;
9955#else
9956 if (use_memcpy) {
9957 res_data = PyUnicode_1BYTE_DATA(res);
9958 kind = PyUnicode_KIND(res);
9959 if (seplen != 0)
9960 sep_data = PyUnicode_1BYTE_DATA(sep);
9961 }
9962#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009963 if (use_memcpy) {
9964 for (i = 0; i < seqlen; ++i) {
9965 Py_ssize_t itemlen;
9966 item = items[i];
9967
9968 /* Copy item, and maybe the separator. */
9969 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009970 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009971 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009972 kind * seplen);
9973 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009975
9976 itemlen = PyUnicode_GET_LENGTH(item);
9977 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009978 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009979 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009980 kind * itemlen);
9981 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009982 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009983 }
9984 assert(res_data == PyUnicode_1BYTE_DATA(res)
9985 + kind * PyUnicode_GET_LENGTH(res));
9986 }
9987 else {
9988 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9989 Py_ssize_t itemlen;
9990 item = items[i];
9991
9992 /* Copy item, and maybe the separator. */
9993 if (i && seplen != 0) {
9994 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9995 res_offset += seplen;
9996 }
9997
9998 itemlen = PyUnicode_GET_LENGTH(item);
9999 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010000 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010001 res_offset += itemlen;
10002 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010003 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010005 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010008 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010
Benjamin Peterson29060642009-01-31 22:14:21 +000010011 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010013 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014 return NULL;
10015}
10016
/* FILL(kind, data, value, start, length)

   Store `length` copies of the code point `value` into the character
   buffer `data`, beginning at character index `start`.  `kind` selects
   the element width (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND is not supported.

   Every argument is evaluated exactly once and captured in a local, so
   arbitrary expressions (including ones with side effects) are safe to
   pass.  The caller is responsible for `value` fitting the element width
   and for `start`/`length` staying inside the buffer. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int kind_ = (int)(kind); \
        void * const data_ = (void *)(data); \
        const Py_UCS4 value_ = (Py_UCS4)(value); \
        const Py_ssize_t start_ = (start); \
        const Py_ssize_t length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert(kind_ != PyUnicode_WCHAR_KIND); \
        switch (kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)data_ + start_; \
            memset(to_, (unsigned char)value_, (size_t)length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)data_ + start_; \
            for (; i_ < length_; ++i_, ++to_) *to_ = (Py_UCS2)value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)data_ + start_; \
            for (; i_ < length_; ++i_, ++to_) *to_ = value_; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10040
Victor Stinnerd3f08822012-05-29 12:57:52 +020010041void
10042_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10043 Py_UCS4 fill_char)
10044{
10045 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10046 const void *data = PyUnicode_DATA(unicode);
10047 assert(PyUnicode_IS_READY(unicode));
10048 assert(unicode_modifiable(unicode));
10049 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10050 assert(start >= 0);
10051 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10052 FILL(kind, data, fill_char, start, length);
10053}
10054
Victor Stinner3fe55312012-01-04 00:33:50 +010010055Py_ssize_t
10056PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10057 Py_UCS4 fill_char)
10058{
10059 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010060
10061 if (!PyUnicode_Check(unicode)) {
10062 PyErr_BadInternalCall();
10063 return -1;
10064 }
10065 if (PyUnicode_READY(unicode) == -1)
10066 return -1;
10067 if (unicode_check_modifiable(unicode))
10068 return -1;
10069
Victor Stinnerd3f08822012-05-29 12:57:52 +020010070 if (start < 0) {
10071 PyErr_SetString(PyExc_IndexError, "string index out of range");
10072 return -1;
10073 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010074 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10075 PyErr_SetString(PyExc_ValueError,
10076 "fill character is bigger than "
10077 "the string maximum character");
10078 return -1;
10079 }
10080
10081 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10082 length = Py_MIN(maxlen, length);
10083 if (length <= 0)
10084 return 0;
10085
Victor Stinnerd3f08822012-05-29 12:57:52 +020010086 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010087 return length;
10088}
10089
Victor Stinner9310abb2011-10-05 00:59:23 +020010090static PyObject *
10091pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010092 Py_ssize_t left,
10093 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 PyObject *u;
10097 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010098 int kind;
10099 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
10101 if (left < 0)
10102 left = 0;
10103 if (right < 0)
10104 right = 0;
10105
Victor Stinnerc4b49542011-12-11 22:44:26 +010010106 if (left == 0 && right == 0)
10107 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10110 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010111 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10112 return NULL;
10113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010115 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010117 if (!u)
10118 return NULL;
10119
10120 kind = PyUnicode_KIND(u);
10121 data = PyUnicode_DATA(u);
10122 if (left)
10123 FILL(kind, data, fill, 0, left);
10124 if (right)
10125 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010126 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010127 assert(_PyUnicode_CheckConsistency(u, 1));
10128 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129}
10130
Alexander Belopolsky40018472011-02-26 01:02:56 +000010131PyObject *
10132PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010136 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
Benjamin Petersonead6b532011-12-20 17:23:42 -060010139 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010141 if (PyUnicode_IS_ASCII(string))
10142 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010143 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010144 PyUnicode_GET_LENGTH(string), keepends);
10145 else
10146 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010147 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 break;
10150 case PyUnicode_2BYTE_KIND:
10151 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010152 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 PyUnicode_GET_LENGTH(string), keepends);
10154 break;
10155 case PyUnicode_4BYTE_KIND:
10156 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010157 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 PyUnicode_GET_LENGTH(string), keepends);
10159 break;
10160 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010161 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164}
10165
Alexander Belopolsky40018472011-02-26 01:02:56 +000010166static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010167split(PyObject *self,
10168 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010169 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010171 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 void *buf1, *buf2;
10173 Py_ssize_t len1, len2;
10174 PyObject* out;
10175
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010177 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (PyUnicode_READY(self) == -1)
10180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010183 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010185 if (PyUnicode_IS_ASCII(self))
10186 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010187 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 PyUnicode_GET_LENGTH(self), maxcount
10189 );
10190 else
10191 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 PyUnicode_GET_LENGTH(self), maxcount
10194 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 case PyUnicode_2BYTE_KIND:
10196 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 PyUnicode_GET_LENGTH(self), maxcount
10199 );
10200 case PyUnicode_4BYTE_KIND:
10201 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyUnicode_GET_LENGTH(self), maxcount
10204 );
10205 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010206 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 }
10208
10209 if (PyUnicode_READY(substring) == -1)
10210 return NULL;
10211
10212 kind1 = PyUnicode_KIND(self);
10213 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 len1 = PyUnicode_GET_LENGTH(self);
10215 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010216 if (kind1 < kind2 || len1 < len2) {
10217 out = PyList_New(1);
10218 if (out == NULL)
10219 return NULL;
10220 Py_INCREF(self);
10221 PyList_SET_ITEM(out, 0, self);
10222 return out;
10223 }
10224 buf1 = PyUnicode_DATA(self);
10225 buf2 = PyUnicode_DATA(substring);
10226 if (kind2 != kind1) {
10227 buf2 = _PyUnicode_AsKind(substring, kind1);
10228 if (!buf2)
10229 return NULL;
10230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010232 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010234 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10235 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010236 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010237 else
10238 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 break;
10241 case PyUnicode_2BYTE_KIND:
10242 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 break;
10245 case PyUnicode_4BYTE_KIND:
10246 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010247 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 break;
10249 default:
10250 out = NULL;
10251 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010252 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 PyMem_Free(buf2);
10254 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255}
10256
Alexander Belopolsky40018472011-02-26 01:02:56 +000010257static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010258rsplit(PyObject *self,
10259 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010260 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010261{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010262 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 void *buf1, *buf2;
10264 Py_ssize_t len1, len2;
10265 PyObject* out;
10266
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010267 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010268 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (PyUnicode_READY(self) == -1)
10271 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010274 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010276 if (PyUnicode_IS_ASCII(self))
10277 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010278 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010279 PyUnicode_GET_LENGTH(self), maxcount
10280 );
10281 else
10282 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010284 PyUnicode_GET_LENGTH(self), maxcount
10285 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 case PyUnicode_2BYTE_KIND:
10287 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 PyUnicode_GET_LENGTH(self), maxcount
10290 );
10291 case PyUnicode_4BYTE_KIND:
10292 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 PyUnicode_GET_LENGTH(self), maxcount
10295 );
10296 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010297 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 }
10299
10300 if (PyUnicode_READY(substring) == -1)
10301 return NULL;
10302
10303 kind1 = PyUnicode_KIND(self);
10304 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 len1 = PyUnicode_GET_LENGTH(self);
10306 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010307 if (kind1 < kind2 || len1 < len2) {
10308 out = PyList_New(1);
10309 if (out == NULL)
10310 return NULL;
10311 Py_INCREF(self);
10312 PyList_SET_ITEM(out, 0, self);
10313 return out;
10314 }
10315 buf1 = PyUnicode_DATA(self);
10316 buf2 = PyUnicode_DATA(substring);
10317 if (kind2 != kind1) {
10318 buf2 = _PyUnicode_AsKind(substring, kind1);
10319 if (!buf2)
10320 return NULL;
10321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010323 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10326 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 else
10329 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 break;
10332 case PyUnicode_2BYTE_KIND:
10333 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 break;
10336 case PyUnicode_4BYTE_KIND:
10337 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010338 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 break;
10340 default:
10341 out = NULL;
10342 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010343 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 PyMem_Free(buf2);
10345 return out;
10346}
10347
10348static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10350 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010352 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010354 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10355 return asciilib_find(buf1, len1, buf2, len2, offset);
10356 else
10357 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 case PyUnicode_2BYTE_KIND:
10359 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10360 case PyUnicode_4BYTE_KIND:
10361 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10362 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010363 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364}
10365
10366static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10368 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010370 switch (kind) {
10371 case PyUnicode_1BYTE_KIND:
10372 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10373 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10374 else
10375 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10376 case PyUnicode_2BYTE_KIND:
10377 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10378 case PyUnicode_4BYTE_KIND:
10379 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10380 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010381 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010382}
10383
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010384static void
10385replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10386 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10387{
10388 int kind = PyUnicode_KIND(u);
10389 void *data = PyUnicode_DATA(u);
10390 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10391 if (kind == PyUnicode_1BYTE_KIND) {
10392 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10393 (Py_UCS1 *)data + len,
10394 u1, u2, maxcount);
10395 }
10396 else if (kind == PyUnicode_2BYTE_KIND) {
10397 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10398 (Py_UCS2 *)data + len,
10399 u1, u2, maxcount);
10400 }
10401 else {
10402 assert(kind == PyUnicode_4BYTE_KIND);
10403 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10404 (Py_UCS4 *)data + len,
10405 u1, u2, maxcount);
10406 }
10407}
10408
Alexander Belopolsky40018472011-02-26 01:02:56 +000010409static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410replace(PyObject *self, PyObject *str1,
10411 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 PyObject *u;
10414 char *sbuf = PyUnicode_DATA(self);
10415 char *buf1 = PyUnicode_DATA(str1);
10416 char *buf2 = PyUnicode_DATA(str2);
10417 int srelease = 0, release1 = 0, release2 = 0;
10418 int skind = PyUnicode_KIND(self);
10419 int kind1 = PyUnicode_KIND(str1);
10420 int kind2 = PyUnicode_KIND(str2);
10421 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10422 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10423 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010424 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010425 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426
10427 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010430 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431
Victor Stinner59de0ee2011-10-07 10:01:28 +020010432 if (str1 == str2)
10433 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434
Victor Stinner49a0a212011-10-12 23:46:10 +020010435 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010436 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10437 if (maxchar < maxchar_str1)
10438 /* substring too wide to be present */
10439 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010440 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10441 /* Replacing str1 with str2 may cause a maxchar reduction in the
10442 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010443 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010444 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010447 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010449 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010452 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010453 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010454
Victor Stinner69ed0f42013-04-09 21:48:24 +020010455 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010456 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010457 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010458 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010459 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010463
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010464 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10465 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010466 }
10467 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 int rkind = skind;
10469 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010470 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (kind1 < rkind) {
10473 /* widen substring */
10474 buf1 = _PyUnicode_AsKind(str1, rkind);
10475 if (!buf1) goto error;
10476 release1 = 1;
10477 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010478 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479 if (i < 0)
10480 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 if (rkind > kind2) {
10482 /* widen replacement */
10483 buf2 = _PyUnicode_AsKind(str2, rkind);
10484 if (!buf2) goto error;
10485 release2 = 1;
10486 }
10487 else if (rkind < kind2) {
10488 /* widen self and buf1 */
10489 rkind = kind2;
10490 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010491 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 sbuf = _PyUnicode_AsKind(self, rkind);
10493 if (!sbuf) goto error;
10494 srelease = 1;
10495 buf1 = _PyUnicode_AsKind(str1, rkind);
10496 if (!buf1) goto error;
10497 release1 = 1;
10498 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010499 u = PyUnicode_New(slen, maxchar);
10500 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010502 assert(PyUnicode_KIND(u) == rkind);
10503 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010504
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010505 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010506 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010507 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010509 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010511
10512 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010513 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010514 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010515 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010516 if (i == -1)
10517 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010518 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010520 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010524 }
10525 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010527 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 int rkind = skind;
10529 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010532 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 buf1 = _PyUnicode_AsKind(str1, rkind);
10534 if (!buf1) goto error;
10535 release1 = 1;
10536 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010537 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010538 if (n == 0)
10539 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010541 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 buf2 = _PyUnicode_AsKind(str2, rkind);
10543 if (!buf2) goto error;
10544 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010547 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 rkind = kind2;
10549 sbuf = _PyUnicode_AsKind(self, rkind);
10550 if (!sbuf) goto error;
10551 srelease = 1;
10552 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010553 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 buf1 = _PyUnicode_AsKind(str1, rkind);
10555 if (!buf1) goto error;
10556 release1 = 1;
10557 }
10558 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10559 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010560 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 PyErr_SetString(PyExc_OverflowError,
10562 "replace string is too long");
10563 goto error;
10564 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010565 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010566 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010567 _Py_INCREF_UNICODE_EMPTY();
10568 if (!unicode_empty)
10569 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010570 u = unicode_empty;
10571 goto done;
10572 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010573 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 PyErr_SetString(PyExc_OverflowError,
10575 "replace string is too long");
10576 goto error;
10577 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010578 u = PyUnicode_New(new_size, maxchar);
10579 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 assert(PyUnicode_KIND(u) == rkind);
10582 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 ires = i = 0;
10584 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010585 while (n-- > 0) {
10586 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010590 if (j == -1)
10591 break;
10592 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 memcpy(res + rkind * ires,
10595 sbuf + rkind * i,
10596 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 }
10599 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010601 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010603 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010610 memcpy(res + rkind * ires,
10611 sbuf + rkind * i,
10612 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 }
10614 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010615 /* interleave */
10616 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010619 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 if (--n <= 0)
10622 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010623 memcpy(res + rkind * ires,
10624 sbuf + rkind * i,
10625 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 ires++;
10627 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010628 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010629 memcpy(res + rkind * ires,
10630 sbuf + rkind * i,
10631 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010633 }
10634
10635 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010636 unicode_adjust_maxchar(&u);
10637 if (u == NULL)
10638 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010640
10641 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 if (srelease)
10643 PyMem_FREE(sbuf);
10644 if (release1)
10645 PyMem_FREE(buf1);
10646 if (release2)
10647 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010648 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 if (srelease)
10654 PyMem_FREE(sbuf);
10655 if (release1)
10656 PyMem_FREE(buf1);
10657 if (release2)
10658 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010659 return unicode_result_unchanged(self);
10660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 error:
10662 if (srelease && sbuf)
10663 PyMem_FREE(sbuf);
10664 if (release1 && buf1)
10665 PyMem_FREE(buf1);
10666 if (release2 && buf2)
10667 PyMem_FREE(buf2);
10668 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669}
10670
10671/* --- Unicode Object Methods --------------------------------------------- */
10672
INADA Naoki3ae20562017-01-16 20:41:20 +090010673/*[clinic input]
10674str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675
INADA Naoki3ae20562017-01-16 20:41:20 +090010676Return a version of the string where each word is titlecased.
10677
10678More specifically, words start with uppercased characters and all remaining
10679cased characters have lower case.
10680[clinic start generated code]*/
10681
10682static PyObject *
10683unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010684/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010686 if (PyUnicode_READY(self) == -1)
10687 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010688 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689}
10690
INADA Naoki3ae20562017-01-16 20:41:20 +090010691/*[clinic input]
10692str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
INADA Naoki3ae20562017-01-16 20:41:20 +090010694Return a capitalized version of the string.
10695
10696More specifically, make the first character have upper case and the rest lower
10697case.
10698[clinic start generated code]*/
10699
10700static PyObject *
10701unicode_capitalize_impl(PyObject *self)
10702/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010704 if (PyUnicode_READY(self) == -1)
10705 return NULL;
10706 if (PyUnicode_GET_LENGTH(self) == 0)
10707 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010708 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709}
10710
INADA Naoki3ae20562017-01-16 20:41:20 +090010711/*[clinic input]
10712str.casefold as unicode_casefold
10713
10714Return a version of the string suitable for caseless comparisons.
10715[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010716
10717static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010718unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010719/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010720{
10721 if (PyUnicode_READY(self) == -1)
10722 return NULL;
10723 if (PyUnicode_IS_ASCII(self))
10724 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010725 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010726}
10727
10728
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010729/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010730
10731static int
10732convert_uc(PyObject *obj, void *addr)
10733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010735
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010736 if (!PyUnicode_Check(obj)) {
10737 PyErr_Format(PyExc_TypeError,
10738 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010739 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010740 return 0;
10741 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010742 if (PyUnicode_READY(obj) < 0)
10743 return 0;
10744 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010745 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010747 return 0;
10748 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010749 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010751}
10752
INADA Naoki3ae20562017-01-16 20:41:20 +090010753/*[clinic input]
10754str.center as unicode_center
10755
10756 width: Py_ssize_t
10757 fillchar: Py_UCS4 = ' '
10758 /
10759
10760Return a centered string of length width.
10761
10762Padding is done using the specified fill character (default is a space).
10763[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
10765static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010766unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10767/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010769 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
Benjamin Petersonbac79492012-01-14 13:34:47 -050010771 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 return NULL;
10773
Victor Stinnerc4b49542011-12-11 22:44:26 +010010774 if (PyUnicode_GET_LENGTH(self) >= width)
10775 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776
Victor Stinnerc4b49542011-12-11 22:44:26 +010010777 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 left = marg / 2 + (marg & width & 1);
10779
Victor Stinner9310abb2011-10-05 00:59:23 +020010780 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781}
10782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783/* This function assumes that str1 and str2 are readied by the caller. */
10784
Marc-André Lemburge5034372000-08-08 08:04:29 +000010785static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010786unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010787{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010788#define COMPARE(TYPE1, TYPE2) \
10789 do { \
10790 TYPE1* p1 = (TYPE1 *)data1; \
10791 TYPE2* p2 = (TYPE2 *)data2; \
10792 TYPE1* end = p1 + len; \
10793 Py_UCS4 c1, c2; \
10794 for (; p1 != end; p1++, p2++) { \
10795 c1 = *p1; \
10796 c2 = *p2; \
10797 if (c1 != c2) \
10798 return (c1 < c2) ? -1 : 1; \
10799 } \
10800 } \
10801 while (0)
10802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 int kind1, kind2;
10804 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010805 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 kind1 = PyUnicode_KIND(str1);
10808 kind2 = PyUnicode_KIND(str2);
10809 data1 = PyUnicode_DATA(str1);
10810 data2 = PyUnicode_DATA(str2);
10811 len1 = PyUnicode_GET_LENGTH(str1);
10812 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010813 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010814
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010815 switch(kind1) {
10816 case PyUnicode_1BYTE_KIND:
10817 {
10818 switch(kind2) {
10819 case PyUnicode_1BYTE_KIND:
10820 {
10821 int cmp = memcmp(data1, data2, len);
10822 /* normalize result of memcmp() into the range [-1; 1] */
10823 if (cmp < 0)
10824 return -1;
10825 if (cmp > 0)
10826 return 1;
10827 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010828 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010829 case PyUnicode_2BYTE_KIND:
10830 COMPARE(Py_UCS1, Py_UCS2);
10831 break;
10832 case PyUnicode_4BYTE_KIND:
10833 COMPARE(Py_UCS1, Py_UCS4);
10834 break;
10835 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010836 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010837 }
10838 break;
10839 }
10840 case PyUnicode_2BYTE_KIND:
10841 {
10842 switch(kind2) {
10843 case PyUnicode_1BYTE_KIND:
10844 COMPARE(Py_UCS2, Py_UCS1);
10845 break;
10846 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010847 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010848 COMPARE(Py_UCS2, Py_UCS2);
10849 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010850 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010851 case PyUnicode_4BYTE_KIND:
10852 COMPARE(Py_UCS2, Py_UCS4);
10853 break;
10854 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010855 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010856 }
10857 break;
10858 }
10859 case PyUnicode_4BYTE_KIND:
10860 {
10861 switch(kind2) {
10862 case PyUnicode_1BYTE_KIND:
10863 COMPARE(Py_UCS4, Py_UCS1);
10864 break;
10865 case PyUnicode_2BYTE_KIND:
10866 COMPARE(Py_UCS4, Py_UCS2);
10867 break;
10868 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010869 {
10870#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10871 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10872 /* normalize result of wmemcmp() into the range [-1; 1] */
10873 if (cmp < 0)
10874 return -1;
10875 if (cmp > 0)
10876 return 1;
10877#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010878 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010879#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010880 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010881 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010883 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010884 }
10885 break;
10886 }
10887 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010888 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010889 }
10890
Victor Stinner770e19e2012-10-04 22:59:45 +020010891 if (len1 == len2)
10892 return 0;
10893 if (len1 < len2)
10894 return -1;
10895 else
10896 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010897
10898#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010899}
10900
Benjamin Peterson621b4302016-09-09 13:54:34 -070010901static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010902unicode_compare_eq(PyObject *str1, PyObject *str2)
10903{
10904 int kind;
10905 void *data1, *data2;
10906 Py_ssize_t len;
10907 int cmp;
10908
Victor Stinnere5567ad2012-10-23 02:48:49 +020010909 len = PyUnicode_GET_LENGTH(str1);
10910 if (PyUnicode_GET_LENGTH(str2) != len)
10911 return 0;
10912 kind = PyUnicode_KIND(str1);
10913 if (PyUnicode_KIND(str2) != kind)
10914 return 0;
10915 data1 = PyUnicode_DATA(str1);
10916 data2 = PyUnicode_DATA(str2);
10917
10918 cmp = memcmp(data1, data2, len * kind);
10919 return (cmp == 0);
10920}
10921
10922
Alexander Belopolsky40018472011-02-26 01:02:56 +000010923int
10924PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10927 if (PyUnicode_READY(left) == -1 ||
10928 PyUnicode_READY(right) == -1)
10929 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010930
10931 /* a string is equal to itself */
10932 if (left == right)
10933 return 0;
10934
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010935 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010937 PyErr_Format(PyExc_TypeError,
10938 "Can't compare %.100s and %.100s",
10939 left->ob_type->tp_name,
10940 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941 return -1;
10942}
10943
Martin v. Löwis5b222132007-06-10 09:51:05 +000010944int
10945PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 Py_ssize_t i;
10948 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010950 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951
Victor Stinner910337b2011-10-03 03:20:16 +020010952 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010953 if (!PyUnicode_IS_READY(uni)) {
10954 const wchar_t *ws = _PyUnicode_WSTR(uni);
10955 /* Compare Unicode string and source character set string */
10956 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10957 if (chr != ustr[i])
10958 return (chr < ustr[i]) ? -1 : 1;
10959 }
10960 /* This check keeps Python strings that end in '\0' from comparing equal
10961 to C strings identical up to that point. */
10962 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10963 return 1; /* uni is longer */
10964 if (ustr[i])
10965 return -1; /* str is longer */
10966 return 0;
10967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010969 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010970 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010971 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010972 size_t len, len2 = strlen(str);
10973 int cmp;
10974
10975 len = Py_MIN(len1, len2);
10976 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010977 if (cmp != 0) {
10978 if (cmp < 0)
10979 return -1;
10980 else
10981 return 1;
10982 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010983 if (len1 > len2)
10984 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010985 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010986 return -1; /* str is longer */
10987 return 0;
10988 }
10989 else {
10990 void *data = PyUnicode_DATA(uni);
10991 /* Compare Unicode string and source character set string */
10992 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010993 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010994 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10995 /* This check keeps Python strings that end in '\0' from comparing equal
10996 to C strings identical up to that point. */
10997 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10998 return 1; /* uni is longer */
10999 if (str[i])
11000 return -1; /* str is longer */
11001 return 0;
11002 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011003}
11004
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011005static int
11006non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11007{
11008 size_t i, len;
11009 const wchar_t *p;
11010 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11011 if (strlen(str) != len)
11012 return 0;
11013 p = _PyUnicode_WSTR(unicode);
11014 assert(p);
11015 for (i = 0; i < len; i++) {
11016 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011017 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011018 return 0;
11019 }
11020 return 1;
11021}
11022
11023int
11024_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11025{
11026 size_t len;
11027 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011028 assert(str);
11029#ifndef NDEBUG
11030 for (const char *p = str; *p; p++) {
11031 assert((unsigned char)*p < 128);
11032 }
11033#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011034 if (PyUnicode_READY(unicode) == -1) {
11035 /* Memory error or bad data */
11036 PyErr_Clear();
11037 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11038 }
11039 if (!PyUnicode_IS_ASCII(unicode))
11040 return 0;
11041 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11042 return strlen(str) == len &&
11043 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11044}
11045
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011046int
11047_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11048{
11049 PyObject *right_uni;
11050 Py_hash_t hash;
11051
11052 assert(_PyUnicode_CHECK(left));
11053 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011054#ifndef NDEBUG
11055 for (const char *p = right->string; *p; p++) {
11056 assert((unsigned char)*p < 128);
11057 }
11058#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011059
11060 if (PyUnicode_READY(left) == -1) {
11061 /* memory error or bad data */
11062 PyErr_Clear();
11063 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11064 }
11065
11066 if (!PyUnicode_IS_ASCII(left))
11067 return 0;
11068
11069 right_uni = _PyUnicode_FromId(right); /* borrowed */
11070 if (right_uni == NULL) {
11071 /* memory error or bad data */
11072 PyErr_Clear();
11073 return _PyUnicode_EqualToASCIIString(left, right->string);
11074 }
11075
11076 if (left == right_uni)
11077 return 1;
11078
11079 if (PyUnicode_CHECK_INTERNED(left))
11080 return 0;
11081
INADA Naoki7cc95f52018-01-28 02:07:09 +090011082 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011083 hash = _PyUnicode_HASH(left);
11084 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11085 return 0;
11086
11087 return unicode_compare_eq(left, right_uni);
11088}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011089
Alexander Belopolsky40018472011-02-26 01:02:56 +000011090PyObject *
11091PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011092{
11093 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011094
Victor Stinnere5567ad2012-10-23 02:48:49 +020011095 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11096 Py_RETURN_NOTIMPLEMENTED;
11097
11098 if (PyUnicode_READY(left) == -1 ||
11099 PyUnicode_READY(right) == -1)
11100 return NULL;
11101
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011102 if (left == right) {
11103 switch (op) {
11104 case Py_EQ:
11105 case Py_LE:
11106 case Py_GE:
11107 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011108 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011109 case Py_NE:
11110 case Py_LT:
11111 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011112 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011113 default:
11114 PyErr_BadArgument();
11115 return NULL;
11116 }
11117 }
11118 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011119 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011120 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011121 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011122 }
11123 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011124 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011125 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011126 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011127}
11128
Alexander Belopolsky40018472011-02-26 01:02:56 +000011129int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011130_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11131{
11132 return unicode_eq(aa, bb);
11133}
11134
11135int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011136PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011137{
Victor Stinner77282cb2013-04-14 19:22:47 +020011138 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 void *buf1, *buf2;
11140 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011141 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011142
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011143 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011145 "'in <string>' requires string as left operand, not %.100s",
11146 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011148 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011149 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011150 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011151 if (ensure_unicode(str) < 0)
11152 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011155 kind2 = PyUnicode_KIND(substr);
11156 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011157 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011159 len2 = PyUnicode_GET_LENGTH(substr);
11160 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011161 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011162 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011163 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011164 if (len2 == 1) {
11165 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11166 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011167 return result;
11168 }
11169 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011170 buf2 = _PyUnicode_AsKind(substr, kind1);
11171 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011172 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174
Victor Stinner77282cb2013-04-14 19:22:47 +020011175 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 case PyUnicode_1BYTE_KIND:
11177 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11178 break;
11179 case PyUnicode_2BYTE_KIND:
11180 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11181 break;
11182 case PyUnicode_4BYTE_KIND:
11183 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11184 break;
11185 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011186 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011188
Victor Stinner77282cb2013-04-14 19:22:47 +020011189 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 PyMem_Free(buf2);
11191
Guido van Rossum403d68b2000-03-13 15:55:09 +000011192 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011193}
11194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195/* Concat to string or Unicode object giving a new Unicode object. */
11196
Alexander Belopolsky40018472011-02-26 01:02:56 +000011197PyObject *
11198PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011201 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011204 if (ensure_unicode(left) < 0)
11205 return NULL;
11206
11207 if (!PyUnicode_Check(right)) {
11208 PyErr_Format(PyExc_TypeError,
11209 "can only concatenate str (not \"%.200s\") to str",
11210 right->ob_type->tp_name);
11211 return NULL;
11212 }
11213 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
11216 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011217 if (left == unicode_empty)
11218 return PyUnicode_FromObject(right);
11219 if (right == unicode_empty)
11220 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 left_len = PyUnicode_GET_LENGTH(left);
11223 right_len = PyUnicode_GET_LENGTH(right);
11224 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011225 PyErr_SetString(PyExc_OverflowError,
11226 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011228 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011230
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11232 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011233 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 result = PyUnicode_New(new_len, maxchar);
11237 if (result == NULL)
11238 return NULL;
11239 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11240 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11241 assert(_PyUnicode_CheckConsistency(result, 1));
11242 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243}
11244
Walter Dörwald1ab83302007-05-18 17:15:44 +000011245void
Victor Stinner23e56682011-10-03 03:54:37 +020011246PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011247{
Victor Stinner23e56682011-10-03 03:54:37 +020011248 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011249 Py_UCS4 maxchar, maxchar2;
11250 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011251
11252 if (p_left == NULL) {
11253 if (!PyErr_Occurred())
11254 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011255 return;
11256 }
Victor Stinner23e56682011-10-03 03:54:37 +020011257 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011258 if (right == NULL || left == NULL
11259 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011260 if (!PyErr_Occurred())
11261 PyErr_BadInternalCall();
11262 goto error;
11263 }
11264
Benjamin Petersonbac79492012-01-14 13:34:47 -050011265 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011266 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011267 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011268 goto error;
11269
Victor Stinner488fa492011-12-12 00:01:39 +010011270 /* Shortcuts */
11271 if (left == unicode_empty) {
11272 Py_DECREF(left);
11273 Py_INCREF(right);
11274 *p_left = right;
11275 return;
11276 }
11277 if (right == unicode_empty)
11278 return;
11279
11280 left_len = PyUnicode_GET_LENGTH(left);
11281 right_len = PyUnicode_GET_LENGTH(right);
11282 if (left_len > PY_SSIZE_T_MAX - right_len) {
11283 PyErr_SetString(PyExc_OverflowError,
11284 "strings are too large to concat");
11285 goto error;
11286 }
11287 new_len = left_len + right_len;
11288
11289 if (unicode_modifiable(left)
11290 && PyUnicode_CheckExact(right)
11291 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011292 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11293 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011294 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011295 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011296 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11297 {
11298 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011299 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011300 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011301
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011302 /* copy 'right' into the newly allocated area of 'left' */
11303 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011304 }
Victor Stinner488fa492011-12-12 00:01:39 +010011305 else {
11306 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11307 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011308 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011309
Victor Stinner488fa492011-12-12 00:01:39 +010011310 /* Concat the two Unicode strings */
11311 res = PyUnicode_New(new_len, maxchar);
11312 if (res == NULL)
11313 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011314 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11315 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011316 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011317 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011318 }
11319 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011320 return;
11321
11322error:
Victor Stinner488fa492011-12-12 00:01:39 +010011323 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011324}
11325
11326void
11327PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011329 PyUnicode_Append(pleft, right);
11330 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011331}
11332
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011333/*
11334Wraps stringlib_parse_args_finds() and additionally ensures that the
11335first argument is a unicode object.
11336*/
11337
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011338static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011339parse_args_finds_unicode(const char * function_name, PyObject *args,
11340 PyObject **substring,
11341 Py_ssize_t *start, Py_ssize_t *end)
11342{
11343 if(stringlib_parse_args_finds(function_name, args, substring,
11344 start, end)) {
11345 if (ensure_unicode(*substring) < 0)
11346 return 0;
11347 return 1;
11348 }
11349 return 0;
11350}
11351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011352PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011355Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011356string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011357interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
11359static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011360unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011362 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011363 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011364 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011366 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 void *buf1, *buf2;
11368 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011370 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 kind1 = PyUnicode_KIND(self);
11374 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011376 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 len1 = PyUnicode_GET_LENGTH(self);
11379 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011381 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011382 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011383
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011384 buf1 = PyUnicode_DATA(self);
11385 buf2 = PyUnicode_DATA(substring);
11386 if (kind2 != kind1) {
11387 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011388 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011389 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011390 }
11391 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 case PyUnicode_1BYTE_KIND:
11393 iresult = ucs1lib_count(
11394 ((Py_UCS1*)buf1) + start, end - start,
11395 buf2, len2, PY_SSIZE_T_MAX
11396 );
11397 break;
11398 case PyUnicode_2BYTE_KIND:
11399 iresult = ucs2lib_count(
11400 ((Py_UCS2*)buf1) + start, end - start,
11401 buf2, len2, PY_SSIZE_T_MAX
11402 );
11403 break;
11404 case PyUnicode_4BYTE_KIND:
11405 iresult = ucs4lib_count(
11406 ((Py_UCS4*)buf1) + start, end - start,
11407 buf2, len2, PY_SSIZE_T_MAX
11408 );
11409 break;
11410 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011411 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 }
11413
11414 result = PyLong_FromSsize_t(iresult);
11415
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011416 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419 return result;
11420}
11421
INADA Naoki3ae20562017-01-16 20:41:20 +090011422/*[clinic input]
11423str.encode as unicode_encode
11424
11425 encoding: str(c_default="NULL") = 'utf-8'
11426 The encoding in which to encode the string.
11427 errors: str(c_default="NULL") = 'strict'
11428 The error handling scheme to use for encoding errors.
11429 The default is 'strict' meaning that encoding errors raise a
11430 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11431 'xmlcharrefreplace' as well as any other name registered with
11432 codecs.register_error that can handle UnicodeEncodeErrors.
11433
11434Encode the string using the codec registered for encoding.
11435[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
11437static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011438unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011439/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011441 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011442}
11443
INADA Naoki3ae20562017-01-16 20:41:20 +090011444/*[clinic input]
11445str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
INADA Naoki3ae20562017-01-16 20:41:20 +090011447 tabsize: int = 8
11448
11449Return a copy where all tab characters are expanded using spaces.
11450
11451If tabsize is not given, a tab size of 8 characters is assumed.
11452[clinic start generated code]*/
11453
11454static PyObject *
11455unicode_expandtabs_impl(PyObject *self, int tabsize)
11456/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011458 Py_ssize_t i, j, line_pos, src_len, incr;
11459 Py_UCS4 ch;
11460 PyObject *u;
11461 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011463 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
Antoine Pitrou22425222011-10-04 19:10:51 +020011465 if (PyUnicode_READY(self) == -1)
11466 return NULL;
11467
Thomas Wouters7e474022000-07-16 12:04:32 +000011468 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 src_len = PyUnicode_GET_LENGTH(self);
11470 i = j = line_pos = 0;
11471 kind = PyUnicode_KIND(self);
11472 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011473 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 for (; i < src_len; i++) {
11475 ch = PyUnicode_READ(kind, src_data, i);
11476 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011477 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011479 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 goto overflow;
11482 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011484 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011488 goto overflow;
11489 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011491 if (ch == '\n' || ch == '\r')
11492 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011494 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011495 if (!found)
11496 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011497
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011499 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 if (!u)
11501 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011502 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
Antoine Pitroue71d5742011-10-04 15:55:09 +020011506 for (; i < src_len; i++) {
11507 ch = PyUnicode_READ(kind, src_data, i);
11508 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 incr = tabsize - (line_pos % tabsize);
11511 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011512 FILL(kind, dest_data, ' ', j, incr);
11513 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011515 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011517 line_pos++;
11518 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011519 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011520 if (ch == '\n' || ch == '\r')
11521 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011523 }
11524 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011525 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011526
Antoine Pitroue71d5742011-10-04 15:55:09 +020011527 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011528 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530}
11531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534\n\
11535Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011536such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537arguments start and end are interpreted as in slice notation.\n\
11538\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011539Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540
11541static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011544 /* initialize variables to prevent gcc warning */
11545 PyObject *substring = NULL;
11546 Py_ssize_t start = 0;
11547 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011548 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011550 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011553 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011556 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 if (result == -2)
11559 return NULL;
11560
Christian Heimes217cfd12007-12-02 14:31:20 +000011561 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562}
11563
11564static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011565unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011567 void *data;
11568 enum PyUnicode_Kind kind;
11569 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011570
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011571 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011572 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011574 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011575 if (PyUnicode_READY(self) == -1) {
11576 return NULL;
11577 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011578 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11579 PyErr_SetString(PyExc_IndexError, "string index out of range");
11580 return NULL;
11581 }
11582 kind = PyUnicode_KIND(self);
11583 data = PyUnicode_DATA(self);
11584 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011585 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586}
11587
Guido van Rossumc2504932007-09-18 19:42:40 +000011588/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011589 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011590static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011591unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592{
Guido van Rossumc2504932007-09-18 19:42:40 +000011593 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011594 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011595
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011596#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011597 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011598#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 if (_PyUnicode_HASH(self) != -1)
11600 return _PyUnicode_HASH(self);
11601 if (PyUnicode_READY(self) == -1)
11602 return -1;
11603 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011604 /*
11605 We make the hash of the empty string be 0, rather than using
11606 (prefix ^ suffix), since this slightly obfuscates the hash secret
11607 */
11608 if (len == 0) {
11609 _PyUnicode_HASH(self) = 0;
11610 return 0;
11611 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011612 x = _Py_HashBytes(PyUnicode_DATA(self),
11613 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011615 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616}
11617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620\n\
oldkaa0735f2018-02-02 16:52:55 +080011621Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011622such that sub is contained within S[start:end]. Optional\n\
11623arguments start and end are interpreted as in slice notation.\n\
11624\n\
11625Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
11627static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011630 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011631 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011632 PyObject *substring = NULL;
11633 Py_ssize_t start = 0;
11634 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011636 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011639 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011642 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 if (result == -2)
11645 return NULL;
11646
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647 if (result < 0) {
11648 PyErr_SetString(PyExc_ValueError, "substring not found");
11649 return NULL;
11650 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011651
Christian Heimes217cfd12007-12-02 14:31:20 +000011652 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653}
11654
INADA Naoki3ae20562017-01-16 20:41:20 +090011655/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011656str.isascii as unicode_isascii
11657
11658Return True if all characters in the string are ASCII, False otherwise.
11659
11660ASCII characters have code points in the range U+0000-U+007F.
11661Empty string is ASCII too.
11662[clinic start generated code]*/
11663
11664static PyObject *
11665unicode_isascii_impl(PyObject *self)
11666/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11667{
11668 if (PyUnicode_READY(self) == -1) {
11669 return NULL;
11670 }
11671 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11672}
11673
11674/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011675str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
INADA Naoki3ae20562017-01-16 20:41:20 +090011677Return True if the string is a lowercase string, False otherwise.
11678
11679A string is lowercase if all cased characters in the string are lowercase and
11680there is at least one cased character in the string.
11681[clinic start generated code]*/
11682
11683static PyObject *
11684unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011685/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 Py_ssize_t i, length;
11688 int kind;
11689 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690 int cased;
11691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (PyUnicode_READY(self) == -1)
11693 return NULL;
11694 length = PyUnicode_GET_LENGTH(self);
11695 kind = PyUnicode_KIND(self);
11696 data = PyUnicode_DATA(self);
11697
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (length == 1)
11700 return PyBool_FromLong(
11701 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011703 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011705 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011706
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 for (i = 0; i < length; i++) {
11709 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011710
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011712 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 else if (!cased && Py_UNICODE_ISLOWER(ch))
11714 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011716 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717}
11718
INADA Naoki3ae20562017-01-16 20:41:20 +090011719/*[clinic input]
11720str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
INADA Naoki3ae20562017-01-16 20:41:20 +090011722Return True if the string is an uppercase string, False otherwise.
11723
11724A string is uppercase if all cased characters in the string are uppercase and
11725there is at least one cased character in the string.
11726[clinic start generated code]*/
11727
11728static PyObject *
11729unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011730/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 Py_ssize_t i, length;
11733 int kind;
11734 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735 int cased;
11736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (PyUnicode_READY(self) == -1)
11738 return NULL;
11739 length = PyUnicode_GET_LENGTH(self);
11740 kind = PyUnicode_KIND(self);
11741 data = PyUnicode_DATA(self);
11742
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (length == 1)
11745 return PyBool_FromLong(
11746 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011748 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011750 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011751
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 for (i = 0; i < length; i++) {
11754 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011755
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011757 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 else if (!cased && Py_UNICODE_ISUPPER(ch))
11759 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011761 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762}
11763
INADA Naoki3ae20562017-01-16 20:41:20 +090011764/*[clinic input]
11765str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
INADA Naoki3ae20562017-01-16 20:41:20 +090011767Return True if the string is a title-cased string, False otherwise.
11768
11769In a title-cased string, upper- and title-case characters may only
11770follow uncased characters and lowercase characters only cased ones.
11771[clinic start generated code]*/
11772
11773static PyObject *
11774unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011775/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 Py_ssize_t i, length;
11778 int kind;
11779 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 int cased, previous_is_cased;
11781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (PyUnicode_READY(self) == -1)
11783 return NULL;
11784 length = PyUnicode_GET_LENGTH(self);
11785 kind = PyUnicode_KIND(self);
11786 data = PyUnicode_DATA(self);
11787
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 if (length == 1) {
11790 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11791 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11792 (Py_UNICODE_ISUPPER(ch) != 0));
11793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011795 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011797 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011798
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 cased = 0;
11800 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 for (i = 0; i < length; i++) {
11802 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011803
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11805 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011806 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 previous_is_cased = 1;
11808 cased = 1;
11809 }
11810 else if (Py_UNICODE_ISLOWER(ch)) {
11811 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011812 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 previous_is_cased = 1;
11814 cased = 1;
11815 }
11816 else
11817 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011819 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820}
11821
INADA Naoki3ae20562017-01-16 20:41:20 +090011822/*[clinic input]
11823str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824
INADA Naoki3ae20562017-01-16 20:41:20 +090011825Return True if the string is a whitespace string, False otherwise.
11826
11827A string is whitespace if all characters in the string are whitespace and there
11828is at least one character in the string.
11829[clinic start generated code]*/
11830
11831static PyObject *
11832unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011833/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 Py_ssize_t i, length;
11836 int kind;
11837 void *data;
11838
11839 if (PyUnicode_READY(self) == -1)
11840 return NULL;
11841 length = PyUnicode_GET_LENGTH(self);
11842 kind = PyUnicode_KIND(self);
11843 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 if (length == 1)
11847 return PyBool_FromLong(
11848 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011850 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011852 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 for (i = 0; i < length; i++) {
11855 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011856 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011857 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011859 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860}
11861
INADA Naoki3ae20562017-01-16 20:41:20 +090011862/*[clinic input]
11863str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011864
INADA Naoki3ae20562017-01-16 20:41:20 +090011865Return True if the string is an alphabetic string, False otherwise.
11866
11867A string is alphabetic if all characters in the string are alphabetic and there
11868is at least one character in the string.
11869[clinic start generated code]*/
11870
11871static PyObject *
11872unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011873/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 Py_ssize_t i, length;
11876 int kind;
11877 void *data;
11878
11879 if (PyUnicode_READY(self) == -1)
11880 return NULL;
11881 length = PyUnicode_GET_LENGTH(self);
11882 kind = PyUnicode_KIND(self);
11883 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011884
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011885 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (length == 1)
11887 return PyBool_FromLong(
11888 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011889
11890 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011892 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 for (i = 0; i < length; i++) {
11895 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011896 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011897 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011898 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011899}
11900
INADA Naoki3ae20562017-01-16 20:41:20 +090011901/*[clinic input]
11902str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904Return True if the string is an alpha-numeric string, False otherwise.
11905
11906A string is alpha-numeric if all characters in the string are alpha-numeric and
11907there is at least one character in the string.
11908[clinic start generated code]*/
11909
11910static PyObject *
11911unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011912/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 int kind;
11915 void *data;
11916 Py_ssize_t len, i;
11917
11918 if (PyUnicode_READY(self) == -1)
11919 return NULL;
11920
11921 kind = PyUnicode_KIND(self);
11922 data = PyUnicode_DATA(self);
11923 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011924
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011925 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (len == 1) {
11927 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11928 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11929 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011930
11931 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011933 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 for (i = 0; i < len; i++) {
11936 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011937 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011938 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011940 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941}
11942
INADA Naoki3ae20562017-01-16 20:41:20 +090011943/*[clinic input]
11944str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
INADA Naoki3ae20562017-01-16 20:41:20 +090011946Return True if the string is a decimal string, False otherwise.
11947
11948A string is a decimal string if all characters in the string are decimal and
11949there is at least one character in the string.
11950[clinic start generated code]*/
11951
11952static PyObject *
11953unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011954/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 Py_ssize_t i, length;
11957 int kind;
11958 void *data;
11959
11960 if (PyUnicode_READY(self) == -1)
11961 return NULL;
11962 length = PyUnicode_GET_LENGTH(self);
11963 kind = PyUnicode_KIND(self);
11964 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 if (length == 1)
11968 return PyBool_FromLong(
11969 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011971 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011973 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 for (i = 0; i < length; i++) {
11976 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011977 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011979 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980}
11981
INADA Naoki3ae20562017-01-16 20:41:20 +090011982/*[clinic input]
11983str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
INADA Naoki3ae20562017-01-16 20:41:20 +090011985Return True if the string is a digit string, False otherwise.
11986
11987A string is a digit string if all characters in the string are digits and there
11988is at least one character in the string.
11989[clinic start generated code]*/
11990
11991static PyObject *
11992unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011993/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 Py_ssize_t i, length;
11996 int kind;
11997 void *data;
11998
11999 if (PyUnicode_READY(self) == -1)
12000 return NULL;
12001 length = PyUnicode_GET_LENGTH(self);
12002 kind = PyUnicode_KIND(self);
12003 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (length == 1) {
12007 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12008 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012011 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012013 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 for (i = 0; i < length; i++) {
12016 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012017 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012019 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020}
12021
INADA Naoki3ae20562017-01-16 20:41:20 +090012022/*[clinic input]
12023str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024
INADA Naoki3ae20562017-01-16 20:41:20 +090012025Return True if the string is a numeric string, False otherwise.
12026
12027A string is numeric if all characters in the string are numeric and there is at
12028least one character in the string.
12029[clinic start generated code]*/
12030
12031static PyObject *
12032unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012033/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 Py_ssize_t i, length;
12036 int kind;
12037 void *data;
12038
12039 if (PyUnicode_READY(self) == -1)
12040 return NULL;
12041 length = PyUnicode_GET_LENGTH(self);
12042 kind = PyUnicode_KIND(self);
12043 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 if (length == 1)
12047 return PyBool_FromLong(
12048 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012050 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012052 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 for (i = 0; i < length; i++) {
12055 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012056 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012058 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059}
12060
Martin v. Löwis47383402007-08-15 07:32:56 +000012061int
12062PyUnicode_IsIdentifier(PyObject *self)
12063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 int kind;
12065 void *data;
12066 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012067 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (PyUnicode_READY(self) == -1) {
12070 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 }
12073
12074 /* Special case for empty strings */
12075 if (PyUnicode_GET_LENGTH(self) == 0)
12076 return 0;
12077 kind = PyUnicode_KIND(self);
12078 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012079
12080 /* PEP 3131 says that the first character must be in
12081 XID_Start and subsequent characters in XID_Continue,
12082 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012083 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012084 letters, digits, underscore). However, given the current
12085 definition of XID_Start and XID_Continue, it is sufficient
12086 to check just for these, except that _ must be allowed
12087 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012089 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012090 return 0;
12091
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012092 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012095 return 1;
12096}
12097
INADA Naoki3ae20562017-01-16 20:41:20 +090012098/*[clinic input]
12099str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012100
INADA Naoki3ae20562017-01-16 20:41:20 +090012101Return True if the string is a valid Python identifier, False otherwise.
12102
12103Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12104"class".
12105[clinic start generated code]*/
12106
12107static PyObject *
12108unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012109/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012110{
12111 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12112}
12113
INADA Naoki3ae20562017-01-16 20:41:20 +090012114/*[clinic input]
12115str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012116
INADA Naoki3ae20562017-01-16 20:41:20 +090012117Return True if the string is printable, False otherwise.
12118
12119A string is printable if all of its characters are considered printable in
12120repr() or if it is empty.
12121[clinic start generated code]*/
12122
12123static PyObject *
12124unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012125/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 Py_ssize_t i, length;
12128 int kind;
12129 void *data;
12130
12131 if (PyUnicode_READY(self) == -1)
12132 return NULL;
12133 length = PyUnicode_GET_LENGTH(self);
12134 kind = PyUnicode_KIND(self);
12135 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012136
12137 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 if (length == 1)
12139 return PyBool_FromLong(
12140 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 for (i = 0; i < length; i++) {
12143 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012144 Py_RETURN_FALSE;
12145 }
12146 }
12147 Py_RETURN_TRUE;
12148}
12149
INADA Naoki3ae20562017-01-16 20:41:20 +090012150/*[clinic input]
12151str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
INADA Naoki3ae20562017-01-16 20:41:20 +090012153 iterable: object
12154 /
12155
12156Concatenate any number of strings.
12157
Martin Panter91a88662017-01-24 00:30:06 +000012158The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012159The result is returned as a new string.
12160
12161Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12162[clinic start generated code]*/
12163
12164static PyObject *
12165unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012166/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167{
INADA Naoki3ae20562017-01-16 20:41:20 +090012168 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169}
12170
Martin v. Löwis18e16552006-02-15 17:27:45 +000012171static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012172unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 if (PyUnicode_READY(self) == -1)
12175 return -1;
12176 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177}
12178
INADA Naoki3ae20562017-01-16 20:41:20 +090012179/*[clinic input]
12180str.ljust as unicode_ljust
12181
12182 width: Py_ssize_t
12183 fillchar: Py_UCS4 = ' '
12184 /
12185
12186Return a left-justified string of length width.
12187
12188Padding is done using the specified fill character (default is a space).
12189[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
12191static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012192unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12193/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012195 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
Victor Stinnerc4b49542011-12-11 22:44:26 +010012198 if (PyUnicode_GET_LENGTH(self) >= width)
12199 return unicode_result_unchanged(self);
12200
12201 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202}
12203
INADA Naoki3ae20562017-01-16 20:41:20 +090012204/*[clinic input]
12205str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
INADA Naoki3ae20562017-01-16 20:41:20 +090012207Return a copy of the string converted to lowercase.
12208[clinic start generated code]*/
12209
12210static PyObject *
12211unicode_lower_impl(PyObject *self)
12212/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012214 if (PyUnicode_READY(self) == -1)
12215 return NULL;
12216 if (PyUnicode_IS_ASCII(self))
12217 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012218 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219}
12220
/* Strip modes; the values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip modes above. */
static const char *stripfuncnames[] = {
    "lstrip",   /* LEFTSTRIP */
    "rstrip",   /* RIGHTSTRIP */
    "strip"     /* BOTHSTRIP */
};

/* Name of the method that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012229
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012230/* externally visible for str.strip(unicode) */
12231PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012232_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 void *data;
12235 int kind;
12236 Py_ssize_t i, j, len;
12237 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012238 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12241 return NULL;
12242
12243 kind = PyUnicode_KIND(self);
12244 data = PyUnicode_DATA(self);
12245 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012246 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12248 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012249 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012250
Benjamin Peterson14339b62009-01-31 16:36:08 +000012251 i = 0;
12252 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012253 while (i < len) {
12254 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12255 if (!BLOOM(sepmask, ch))
12256 break;
12257 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12258 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 i++;
12260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012261 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012262
Benjamin Peterson14339b62009-01-31 16:36:08 +000012263 j = len;
12264 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012265 j--;
12266 while (j >= i) {
12267 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12268 if (!BLOOM(sepmask, ch))
12269 break;
12270 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12271 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012273 }
12274
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012276 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
Victor Stinner7931d9a2011-11-04 00:22:48 +010012278 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279}
12280
12281PyObject*
12282PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12283{
12284 unsigned char *data;
12285 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012286 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287
Victor Stinnerde636f32011-10-01 03:55:54 +020012288 if (PyUnicode_READY(self) == -1)
12289 return NULL;
12290
Victor Stinner684d5fd2012-05-03 02:32:34 +020012291 length = PyUnicode_GET_LENGTH(self);
12292 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012293
Victor Stinner684d5fd2012-05-03 02:32:34 +020012294 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012295 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296
Victor Stinnerde636f32011-10-01 03:55:54 +020012297 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012298 PyErr_SetString(PyExc_IndexError, "string index out of range");
12299 return NULL;
12300 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012301 if (start >= length || end < start)
12302 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012303
Victor Stinner684d5fd2012-05-03 02:32:34 +020012304 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012305 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012306 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012307 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012308 }
12309 else {
12310 kind = PyUnicode_KIND(self);
12311 data = PyUnicode_1BYTE_DATA(self);
12312 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012313 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012314 length);
12315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317
12318static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012319do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 Py_ssize_t len, i, j;
12322
12323 if (PyUnicode_READY(self) == -1)
12324 return NULL;
12325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012327
Victor Stinnercc7af722013-04-09 22:39:24 +020012328 if (PyUnicode_IS_ASCII(self)) {
12329 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12330
12331 i = 0;
12332 if (striptype != RIGHTSTRIP) {
12333 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012334 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012335 if (!_Py_ascii_whitespace[ch])
12336 break;
12337 i++;
12338 }
12339 }
12340
12341 j = len;
12342 if (striptype != LEFTSTRIP) {
12343 j--;
12344 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012345 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012346 if (!_Py_ascii_whitespace[ch])
12347 break;
12348 j--;
12349 }
12350 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012351 }
12352 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012353 else {
12354 int kind = PyUnicode_KIND(self);
12355 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012356
Victor Stinnercc7af722013-04-09 22:39:24 +020012357 i = 0;
12358 if (striptype != RIGHTSTRIP) {
12359 while (i < len) {
12360 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12361 if (!Py_UNICODE_ISSPACE(ch))
12362 break;
12363 i++;
12364 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012365 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012366
12367 j = len;
12368 if (striptype != LEFTSTRIP) {
12369 j--;
12370 while (j >= i) {
12371 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12372 if (!Py_UNICODE_ISSPACE(ch))
12373 break;
12374 j--;
12375 }
12376 j++;
12377 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012378 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012379
Victor Stinner7931d9a2011-11-04 00:22:48 +010012380 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381}
12382
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012383
12384static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012385do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012386{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012387 if (sep != NULL && sep != Py_None) {
12388 if (PyUnicode_Check(sep))
12389 return _PyUnicode_XStrip(self, striptype, sep);
12390 else {
12391 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 "%s arg must be None or str",
12393 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 return NULL;
12395 }
12396 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012397
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012399}
12400
12401
INADA Naoki3ae20562017-01-16 20:41:20 +090012402/*[clinic input]
12403str.strip as unicode_strip
12404
12405 chars: object = None
12406 /
12407
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012409
12410If chars is given and not None, remove characters in chars instead.
12411[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012412
12413static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012414unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012415/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012416{
INADA Naoki3ae20562017-01-16 20:41:20 +090012417 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012418}
12419
12420
INADA Naoki3ae20562017-01-16 20:41:20 +090012421/*[clinic input]
12422str.lstrip as unicode_lstrip
12423
12424 chars: object = NULL
12425 /
12426
12427Return a copy of the string with leading whitespace removed.
12428
12429If chars is given and not None, remove characters in chars instead.
12430[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012431
12432static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012433unicode_lstrip_impl(PyObject *self, PyObject *chars)
12434/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435{
INADA Naoki3ae20562017-01-16 20:41:20 +090012436 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012437}
12438
12439
INADA Naoki3ae20562017-01-16 20:41:20 +090012440/*[clinic input]
12441str.rstrip as unicode_rstrip
12442
12443 chars: object = NULL
12444 /
12445
12446Return a copy of the string with trailing whitespace removed.
12447
12448If chars is given and not None, remove characters in chars instead.
12449[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012450
12451static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012452unicode_rstrip_impl(PyObject *self, PyObject *chars)
12453/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454{
INADA Naoki3ae20562017-01-16 20:41:20 +090012455 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456}
12457
12458
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012460unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012462 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464
Serhiy Storchaka05997252013-01-26 12:14:02 +020012465 if (len < 1)
12466 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467
Victor Stinnerc4b49542011-12-11 22:44:26 +010012468 /* no repeat, return original string */
12469 if (len == 1)
12470 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012471
Benjamin Petersonbac79492012-01-14 13:34:47 -050012472 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 return NULL;
12474
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012475 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012476 PyErr_SetString(PyExc_OverflowError,
12477 "repeated string is too long");
12478 return NULL;
12479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012481
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012482 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483 if (!u)
12484 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012485 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 if (PyUnicode_GET_LENGTH(str) == 1) {
12488 const int kind = PyUnicode_KIND(str);
12489 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012490 if (kind == PyUnicode_1BYTE_KIND) {
12491 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012492 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012493 }
12494 else if (kind == PyUnicode_2BYTE_KIND) {
12495 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012496 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012497 ucs2[n] = fill_char;
12498 } else {
12499 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12500 assert(kind == PyUnicode_4BYTE_KIND);
12501 for (n = 0; n < len; ++n)
12502 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 }
12505 else {
12506 /* number of characters copied this far */
12507 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012508 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012510 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012514 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012515 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517 }
12518
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012519 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012520 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521}
12522
Alexander Belopolsky40018472011-02-26 01:02:56 +000012523PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012524PyUnicode_Replace(PyObject *str,
12525 PyObject *substr,
12526 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012527 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012529 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12530 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012532 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533}
12534
INADA Naoki3ae20562017-01-16 20:41:20 +090012535/*[clinic input]
12536str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
INADA Naoki3ae20562017-01-16 20:41:20 +090012538 old: unicode
12539 new: unicode
12540 count: Py_ssize_t = -1
12541 Maximum number of occurrences to replace.
12542 -1 (the default value) means replace all occurrences.
12543 /
12544
12545Return a copy with all occurrences of substring old replaced by new.
12546
12547If the optional argument count is given, only the first count occurrences are
12548replaced.
12549[clinic start generated code]*/
12550
12551static PyObject *
12552unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12553 Py_ssize_t count)
12554/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012556 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012558 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559}
12560
Alexander Belopolsky40018472011-02-26 01:02:56 +000012561static PyObject *
12562unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012564 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 Py_ssize_t isize;
12566 Py_ssize_t osize, squote, dquote, i, o;
12567 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012568 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012572 return NULL;
12573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 isize = PyUnicode_GET_LENGTH(unicode);
12575 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 /* Compute length of output, quote characters, and
12578 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012579 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 max = 127;
12581 squote = dquote = 0;
12582 ikind = PyUnicode_KIND(unicode);
12583 for (i = 0; i < isize; i++) {
12584 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012585 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012587 case '\'': squote++; break;
12588 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012590 incr = 2;
12591 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 default:
12593 /* Fast-path ASCII */
12594 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012595 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012597 ;
12598 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012601 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012603 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012605 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012607 if (osize > PY_SSIZE_T_MAX - incr) {
12608 PyErr_SetString(PyExc_OverflowError,
12609 "string is too long to generate repr");
12610 return NULL;
12611 }
12612 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 }
12614
12615 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012616 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012618 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 if (dquote)
12620 /* Both squote and dquote present. Use squote,
12621 and escape them */
12622 osize += squote;
12623 else
12624 quote = '"';
12625 }
Victor Stinner55c08782013-04-14 18:45:39 +020012626 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627
12628 repr = PyUnicode_New(osize, max);
12629 if (repr == NULL)
12630 return NULL;
12631 okind = PyUnicode_KIND(repr);
12632 odata = PyUnicode_DATA(repr);
12633
12634 PyUnicode_WRITE(okind, odata, 0, quote);
12635 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012636 if (unchanged) {
12637 _PyUnicode_FastCopyCharacters(repr, 1,
12638 unicode, 0,
12639 isize);
12640 }
12641 else {
12642 for (i = 0, o = 1; i < isize; i++) {
12643 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644
Victor Stinner55c08782013-04-14 18:45:39 +020012645 /* Escape quotes and backslashes */
12646 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012647 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012649 continue;
12650 }
12651
12652 /* Map special whitespace to '\t', \n', '\r' */
12653 if (ch == '\t') {
12654 PyUnicode_WRITE(okind, odata, o++, '\\');
12655 PyUnicode_WRITE(okind, odata, o++, 't');
12656 }
12657 else if (ch == '\n') {
12658 PyUnicode_WRITE(okind, odata, o++, '\\');
12659 PyUnicode_WRITE(okind, odata, o++, 'n');
12660 }
12661 else if (ch == '\r') {
12662 PyUnicode_WRITE(okind, odata, o++, '\\');
12663 PyUnicode_WRITE(okind, odata, o++, 'r');
12664 }
12665
12666 /* Map non-printable US ASCII to '\xhh' */
12667 else if (ch < ' ' || ch == 0x7F) {
12668 PyUnicode_WRITE(okind, odata, o++, '\\');
12669 PyUnicode_WRITE(okind, odata, o++, 'x');
12670 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12671 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12672 }
12673
12674 /* Copy ASCII characters as-is */
12675 else if (ch < 0x7F) {
12676 PyUnicode_WRITE(okind, odata, o++, ch);
12677 }
12678
12679 /* Non-ASCII characters */
12680 else {
12681 /* Map Unicode whitespace and control characters
12682 (categories Z* and C* except ASCII space)
12683 */
12684 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12685 PyUnicode_WRITE(okind, odata, o++, '\\');
12686 /* Map 8-bit characters to '\xhh' */
12687 if (ch <= 0xff) {
12688 PyUnicode_WRITE(okind, odata, o++, 'x');
12689 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12690 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12691 }
12692 /* Map 16-bit characters to '\uxxxx' */
12693 else if (ch <= 0xffff) {
12694 PyUnicode_WRITE(okind, odata, o++, 'u');
12695 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12696 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12697 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12698 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12699 }
12700 /* Map 21-bit characters to '\U00xxxxxx' */
12701 else {
12702 PyUnicode_WRITE(okind, odata, o++, 'U');
12703 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12704 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12705 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12706 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12709 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12711 }
12712 }
12713 /* Copy characters as-is */
12714 else {
12715 PyUnicode_WRITE(okind, odata, o++, ch);
12716 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012717 }
12718 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012721 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012722 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723}
12724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012725PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012726 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727\n\
12728Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012729such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730arguments start and end are interpreted as in slice notation.\n\
12731\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012732Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733
12734static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012737 /* initialize variables to prevent gcc warning */
12738 PyObject *substring = NULL;
12739 Py_ssize_t start = 0;
12740 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012741 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012743 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012746 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012749 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 if (result == -2)
12752 return NULL;
12753
Christian Heimes217cfd12007-12-02 14:31:20 +000012754 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755}
12756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012757PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012760Return the highest index in S where substring sub is found,\n\
12761such that sub is contained within S[start:end]. Optional\n\
12762arguments start and end are interpreted as in slice notation.\n\
12763\n\
12764Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765
12766static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012769 /* initialize variables to prevent gcc warning */
12770 PyObject *substring = NULL;
12771 Py_ssize_t start = 0;
12772 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012773 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012775 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012778 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012781 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 if (result == -2)
12784 return NULL;
12785
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786 if (result < 0) {
12787 PyErr_SetString(PyExc_ValueError, "substring not found");
12788 return NULL;
12789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790
Christian Heimes217cfd12007-12-02 14:31:20 +000012791 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792}
12793
INADA Naoki3ae20562017-01-16 20:41:20 +090012794/*[clinic input]
12795str.rjust as unicode_rjust
12796
12797 width: Py_ssize_t
12798 fillchar: Py_UCS4 = ' '
12799 /
12800
12801Return a right-justified string of length width.
12802
12803Padding is done using the specified fill character (default is a space).
12804[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
12806static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012807unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12808/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012810 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811 return NULL;
12812
Victor Stinnerc4b49542011-12-11 22:44:26 +010012813 if (PyUnicode_GET_LENGTH(self) >= width)
12814 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815
Victor Stinnerc4b49542011-12-11 22:44:26 +010012816 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817}
12818
Alexander Belopolsky40018472011-02-26 01:02:56 +000012819PyObject *
12820PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012822 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012825 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826}
12827
INADA Naoki3ae20562017-01-16 20:41:20 +090012828/*[clinic input]
12829str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830
INADA Naoki3ae20562017-01-16 20:41:20 +090012831 sep: object = None
        The delimiter according to which to split the string.
12833 None (the default value) means split according to any whitespace,
12834 and discard empty strings from the result.
12835 maxsplit: Py_ssize_t = -1
12836 Maximum number of splits to do.
12837 -1 (the default value) means no limit.
12838
12839Return a list of the words in the string, using sep as the delimiter string.
12840[clinic start generated code]*/
12841
12842static PyObject *
12843unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12844/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
INADA Naoki3ae20562017-01-16 20:41:20 +090012846 if (sep == Py_None)
12847 return split(self, NULL, maxsplit);
12848 if (PyUnicode_Check(sep))
12849 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012850
Victor Stinner998b8062018-09-12 00:23:25 +020012851 PyErr_Format(PyExc_TypeError,
12852 "must be str or None, not %.100s",
12853 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855}
12856
Thomas Wouters477c8d52006-05-27 19:21:47 +000012857PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012858PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012859{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012861 int kind1, kind2;
12862 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012864
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012865 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867
Victor Stinner14f8f022011-10-05 20:58:25 +020012868 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 len1 = PyUnicode_GET_LENGTH(str_obj);
12871 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012872 if (kind1 < kind2 || len1 < len2) {
12873 _Py_INCREF_UNICODE_EMPTY();
12874 if (!unicode_empty)
12875 out = NULL;
12876 else {
12877 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12878 Py_DECREF(unicode_empty);
12879 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012880 return out;
12881 }
12882 buf1 = PyUnicode_DATA(str_obj);
12883 buf2 = PyUnicode_DATA(sep_obj);
12884 if (kind2 != kind1) {
12885 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12886 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012887 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012890 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012892 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12893 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12894 else
12895 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 break;
12897 case PyUnicode_2BYTE_KIND:
12898 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12899 break;
12900 case PyUnicode_4BYTE_KIND:
12901 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12902 break;
12903 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012904 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012907 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909
12910 return out;
12911}
12912
12913
12914PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012915PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012916{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012917 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012918 int kind1, kind2;
12919 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012921
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012922 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012925 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 len1 = PyUnicode_GET_LENGTH(str_obj);
12928 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012929 if (kind1 < kind2 || len1 < len2) {
12930 _Py_INCREF_UNICODE_EMPTY();
12931 if (!unicode_empty)
12932 out = NULL;
12933 else {
12934 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12935 Py_DECREF(unicode_empty);
12936 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012937 return out;
12938 }
12939 buf1 = PyUnicode_DATA(str_obj);
12940 buf2 = PyUnicode_DATA(sep_obj);
12941 if (kind2 != kind1) {
12942 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12943 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012944 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012947 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012949 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12950 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12951 else
12952 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 break;
12954 case PyUnicode_2BYTE_KIND:
12955 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12956 break;
12957 case PyUnicode_4BYTE_KIND:
12958 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959 break;
12960 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012961 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012963
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012964 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966
12967 return out;
12968}
12969
INADA Naoki3ae20562017-01-16 20:41:20 +090012970/*[clinic input]
12971str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972
INADA Naoki3ae20562017-01-16 20:41:20 +090012973 sep: object
12974 /
12975
12976Partition the string into three parts using the given separator.
12977
12978This will search for the separator in the string. If the separator is found,
12979returns a 3-tuple containing the part before the separator, the separator
12980itself, and the part after it.
12981
12982If the separator is not found, returns a 3-tuple containing the original string
12983and two empty strings.
12984[clinic start generated code]*/
12985
12986static PyObject *
12987unicode_partition(PyObject *self, PyObject *sep)
12988/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989{
INADA Naoki3ae20562017-01-16 20:41:20 +090012990 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991}
12992
INADA Naoki3ae20562017-01-16 20:41:20 +090012993/*[clinic input]
12994str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995
INADA Naoki3ae20562017-01-16 20:41:20 +090012996Partition the string into three parts using the given separator.
12997
Serhiy Storchakaa2314282017-10-29 02:11:54 +030012998This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090012999the separator is found, returns a 3-tuple containing the part before the
13000separator, the separator itself, and the part after it.
13001
13002If the separator is not found, returns a 3-tuple containing two empty strings
13003and the original string.
13004[clinic start generated code]*/
13005
13006static PyObject *
13007unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013008/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013009{
INADA Naoki3ae20562017-01-16 20:41:20 +090013010 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013011}
13012
Alexander Belopolsky40018472011-02-26 01:02:56 +000013013PyObject *
13014PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013015{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013016 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013017 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013018
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013019 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013020}
13021
INADA Naoki3ae20562017-01-16 20:41:20 +090013022/*[clinic input]
13023str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013024
INADA Naoki3ae20562017-01-16 20:41:20 +090013025Return a list of the words in the string, using sep as the delimiter string.
13026
13027Splits are done starting at the end of the string and working to the front.
13028[clinic start generated code]*/
13029
13030static PyObject *
13031unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13032/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013033{
INADA Naoki3ae20562017-01-16 20:41:20 +090013034 if (sep == Py_None)
13035 return rsplit(self, NULL, maxsplit);
13036 if (PyUnicode_Check(sep))
13037 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013038
Victor Stinner998b8062018-09-12 00:23:25 +020013039 PyErr_Format(PyExc_TypeError,
13040 "must be str or None, not %.100s",
13041 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013042 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013043}
13044
INADA Naoki3ae20562017-01-16 20:41:20 +090013045/*[clinic input]
13046str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013048 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013049
13050Return a list of the lines in the string, breaking at line boundaries.
13051
13052Line breaks are not included in the resulting list unless keepends is given and
13053true.
13054[clinic start generated code]*/
13055
13056static PyObject *
13057unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013058/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013060 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061}
13062
13063static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013064PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013066 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067}
13068
INADA Naoki3ae20562017-01-16 20:41:20 +090013069/*[clinic input]
13070str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071
INADA Naoki3ae20562017-01-16 20:41:20 +090013072Convert uppercase characters to lowercase and lowercase characters to uppercase.
13073[clinic start generated code]*/
13074
13075static PyObject *
13076unicode_swapcase_impl(PyObject *self)
13077/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013079 if (PyUnicode_READY(self) == -1)
13080 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013081 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082}
13083
Larry Hastings61272b72014-01-07 12:41:53 -080013084/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013085
Larry Hastings31826802013-10-19 00:09:25 -070013086@staticmethod
13087str.maketrans as unicode_maketrans
13088
13089 x: object
13090
13091 y: unicode=NULL
13092
13093 z: unicode=NULL
13094
13095 /
13096
13097Return a translation table usable for str.translate().
13098
13099If there is only one argument, it must be a dictionary mapping Unicode
13100ordinals (integers) or characters to Unicode ordinals, strings or None.
13101Character keys will be then converted to ordinals.
13102If there are two arguments, they must be strings of equal length, and
13103in the resulting dictionary, each character in x will be mapped to the
13104character at the same position in y. If there is a third argument, it
13105must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013106[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013107
Larry Hastings31826802013-10-19 00:09:25 -070013108static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013109unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013110/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013111{
Georg Brandlceee0772007-11-27 23:48:05 +000013112 PyObject *new = NULL, *key, *value;
13113 Py_ssize_t i = 0;
13114 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013115
Georg Brandlceee0772007-11-27 23:48:05 +000013116 new = PyDict_New();
13117 if (!new)
13118 return NULL;
13119 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 int x_kind, y_kind, z_kind;
13121 void *x_data, *y_data, *z_data;
13122
Georg Brandlceee0772007-11-27 23:48:05 +000013123 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013124 if (!PyUnicode_Check(x)) {
13125 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13126 "be a string if there is a second argument");
13127 goto err;
13128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013130 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13131 "arguments must have equal length");
13132 goto err;
13133 }
13134 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013135 x_kind = PyUnicode_KIND(x);
13136 y_kind = PyUnicode_KIND(y);
13137 x_data = PyUnicode_DATA(x);
13138 y_data = PyUnicode_DATA(y);
13139 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13140 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013141 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013142 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013143 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013144 if (!value) {
13145 Py_DECREF(key);
13146 goto err;
13147 }
Georg Brandlceee0772007-11-27 23:48:05 +000013148 res = PyDict_SetItem(new, key, value);
13149 Py_DECREF(key);
13150 Py_DECREF(value);
13151 if (res < 0)
13152 goto err;
13153 }
13154 /* create entries for deleting chars in z */
13155 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 z_kind = PyUnicode_KIND(z);
13157 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013158 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013160 if (!key)
13161 goto err;
13162 res = PyDict_SetItem(new, key, Py_None);
13163 Py_DECREF(key);
13164 if (res < 0)
13165 goto err;
13166 }
13167 }
13168 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 int kind;
13170 void *data;
13171
Georg Brandlceee0772007-11-27 23:48:05 +000013172 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013173 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013174 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13175 "to maketrans it must be a dict");
13176 goto err;
13177 }
13178 /* copy entries into the new dict, converting string keys to int keys */
13179 while (PyDict_Next(x, &i, &key, &value)) {
13180 if (PyUnicode_Check(key)) {
13181 /* convert string keys to integer keys */
13182 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013183 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013184 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13185 "table must be of length 1");
13186 goto err;
13187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 kind = PyUnicode_KIND(key);
13189 data = PyUnicode_DATA(key);
13190 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013191 if (!newkey)
13192 goto err;
13193 res = PyDict_SetItem(new, newkey, value);
13194 Py_DECREF(newkey);
13195 if (res < 0)
13196 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013197 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013198 /* just keep integer keys */
13199 if (PyDict_SetItem(new, key, value) < 0)
13200 goto err;
13201 } else {
13202 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13203 "be strings or integers");
13204 goto err;
13205 }
13206 }
13207 }
13208 return new;
13209 err:
13210 Py_DECREF(new);
13211 return NULL;
13212}
13213
INADA Naoki3ae20562017-01-16 20:41:20 +090013214/*[clinic input]
13215str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216
INADA Naoki3ae20562017-01-16 20:41:20 +090013217 table: object
13218 Translation table, which must be a mapping of Unicode ordinals to
13219 Unicode ordinals, strings, or None.
13220 /
13221
13222Replace each character in the string using the given translation table.
13223
13224The table must implement lookup/indexing via __getitem__, for instance a
13225dictionary or list. If this operation raises LookupError, the character is
13226left untouched. Characters mapped to None are deleted.
13227[clinic start generated code]*/
13228
13229static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013231/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234}
13235
INADA Naoki3ae20562017-01-16 20:41:20 +090013236/*[clinic input]
13237str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238
INADA Naoki3ae20562017-01-16 20:41:20 +090013239Return a copy of the string converted to uppercase.
13240[clinic start generated code]*/
13241
13242static PyObject *
13243unicode_upper_impl(PyObject *self)
13244/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013246 if (PyUnicode_READY(self) == -1)
13247 return NULL;
13248 if (PyUnicode_IS_ASCII(self))
13249 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013250 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251}
13252
INADA Naoki3ae20562017-01-16 20:41:20 +090013253/*[clinic input]
13254str.zfill as unicode_zfill
13255
13256 width: Py_ssize_t
13257 /
13258
13259Pad a numeric string with zeros on the left, to fill a field of the given width.
13260
13261The string is never truncated.
13262[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263
13264static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013265unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013266/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013268 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013269 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 int kind;
13271 void *data;
13272 Py_UCS4 chr;
13273
Benjamin Petersonbac79492012-01-14 13:34:47 -050013274 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276
Victor Stinnerc4b49542011-12-11 22:44:26 +010013277 if (PyUnicode_GET_LENGTH(self) >= width)
13278 return unicode_result_unchanged(self);
13279
13280 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281
13282 u = pad(self, fill, 0, '0');
13283
Walter Dörwald068325e2002-04-15 13:36:47 +000013284 if (u == NULL)
13285 return NULL;
13286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 kind = PyUnicode_KIND(u);
13288 data = PyUnicode_DATA(u);
13289 chr = PyUnicode_READ(kind, data, fill);
13290
13291 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 PyUnicode_WRITE(kind, data, 0, chr);
13294 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295 }
13296
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013297 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013298 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
#if 0
/* Compiled out.  Would delegate to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013309PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013312Return True if S starts with the specified prefix, False otherwise.\n\
13313With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013314With optional end, stop comparing S at that position.\n\
13315prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316
13317static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013318unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013320{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013321 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013322 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013323 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013324 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013325 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326
Jesus Ceaac451502011-04-20 17:09:23 +020013327 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013329 if (PyTuple_Check(subobj)) {
13330 Py_ssize_t i;
13331 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013332 substring = PyTuple_GET_ITEM(subobj, i);
13333 if (!PyUnicode_Check(substring)) {
13334 PyErr_Format(PyExc_TypeError,
13335 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013336 "not %.100s",
13337 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013338 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013339 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013340 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013341 if (result == -1)
13342 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013343 if (result) {
13344 Py_RETURN_TRUE;
13345 }
13346 }
13347 /* nothing matched */
13348 Py_RETURN_FALSE;
13349 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013350 if (!PyUnicode_Check(subobj)) {
13351 PyErr_Format(PyExc_TypeError,
13352 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013353 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013355 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013356 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013357 if (result == -1)
13358 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013359 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360}
13361
13362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013363PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013366Return True if S ends with the specified suffix, False otherwise.\n\
13367With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013368With optional end, stop comparing S at that position.\n\
13369suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370
13371static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013372unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013375 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013376 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013377 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013378 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380
Jesus Ceaac451502011-04-20 17:09:23 +020013381 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 if (PyTuple_Check(subobj)) {
13384 Py_ssize_t i;
13385 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013386 substring = PyTuple_GET_ITEM(subobj, i);
13387 if (!PyUnicode_Check(substring)) {
13388 PyErr_Format(PyExc_TypeError,
13389 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013390 "not %.100s",
13391 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013393 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013394 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013395 if (result == -1)
13396 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397 if (result) {
13398 Py_RETURN_TRUE;
13399 }
13400 }
13401 Py_RETURN_FALSE;
13402 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 if (!PyUnicode_Check(subobj)) {
13404 PyErr_Format(PyExc_TypeError,
13405 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013406 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013408 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013409 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013410 if (result == -1)
13411 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413}
13414
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013415static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013416_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013417{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013418 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13419 writer->data = PyUnicode_DATA(writer->buffer);
13420
13421 if (!writer->readonly) {
13422 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013423 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013424 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013425 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013426 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13427 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13428 writer->kind = PyUnicode_WCHAR_KIND;
13429 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13430
Victor Stinner8f674cc2013-04-17 23:02:17 +020013431 /* Copy-on-write mode: set buffer size to 0 so
13432 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13433 * next write. */
13434 writer->size = 0;
13435 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013436}
13437
Victor Stinnerd3f08822012-05-29 12:57:52 +020013438void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013439_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013440{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013441 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013442
13443 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013444 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013445
13446 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13447 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13448 writer->kind = PyUnicode_WCHAR_KIND;
13449 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013450}
13451
Victor Stinnerd3f08822012-05-29 12:57:52 +020013452int
13453_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13454 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013455{
13456 Py_ssize_t newlen;
13457 PyObject *newbuffer;
13458
Victor Stinner2740e462016-09-06 16:58:36 -070013459 assert(maxchar <= MAX_UNICODE);
13460
Victor Stinnerca9381e2015-09-22 00:58:32 +020013461 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013462 assert((maxchar > writer->maxchar && length >= 0)
13463 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013464
Victor Stinner202fdca2012-05-07 12:47:02 +020013465 if (length > PY_SSIZE_T_MAX - writer->pos) {
13466 PyErr_NoMemory();
13467 return -1;
13468 }
13469 newlen = writer->pos + length;
13470
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013471 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013472
Victor Stinnerd3f08822012-05-29 12:57:52 +020013473 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013474 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013475 if (writer->overallocate
13476 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13477 /* overallocate to limit the number of realloc() */
13478 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013479 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013480 if (newlen < writer->min_length)
13481 newlen = writer->min_length;
13482
Victor Stinnerd3f08822012-05-29 12:57:52 +020013483 writer->buffer = PyUnicode_New(newlen, maxchar);
13484 if (writer->buffer == NULL)
13485 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013486 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013487 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013488 if (writer->overallocate
13489 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13490 /* overallocate to limit the number of realloc() */
13491 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013492 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013493 if (newlen < writer->min_length)
13494 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013495
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013496 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013497 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013498 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013499 newbuffer = PyUnicode_New(newlen, maxchar);
13500 if (newbuffer == NULL)
13501 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013502 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13503 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013504 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013505 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 }
13507 else {
13508 newbuffer = resize_compact(writer->buffer, newlen);
13509 if (newbuffer == NULL)
13510 return -1;
13511 }
13512 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013513 }
13514 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013515 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013516 newbuffer = PyUnicode_New(writer->size, maxchar);
13517 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013518 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13520 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013521 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013522 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013523 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013524 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013525
13526#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013527}
13528
Victor Stinnerca9381e2015-09-22 00:58:32 +020013529int
13530_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13531 enum PyUnicode_Kind kind)
13532{
13533 Py_UCS4 maxchar;
13534
13535 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13536 assert(writer->kind < kind);
13537
13538 switch (kind)
13539 {
13540 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13541 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13542 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13543 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013544 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013545 }
13546
13547 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13548}
13549
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013550static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013551_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013552{
Victor Stinner2740e462016-09-06 16:58:36 -070013553 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013554 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13555 return -1;
13556 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13557 writer->pos++;
13558 return 0;
13559}
13560
13561int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013562_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13563{
13564 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13565}
13566
13567int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013568_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13569{
13570 Py_UCS4 maxchar;
13571 Py_ssize_t len;
13572
13573 if (PyUnicode_READY(str) == -1)
13574 return -1;
13575 len = PyUnicode_GET_LENGTH(str);
13576 if (len == 0)
13577 return 0;
13578 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13579 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013580 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013581 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013582 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013583 Py_INCREF(str);
13584 writer->buffer = str;
13585 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013586 writer->pos += len;
13587 return 0;
13588 }
13589 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13590 return -1;
13591 }
13592 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13593 str, 0, len);
13594 writer->pos += len;
13595 return 0;
13596}
13597
Victor Stinnere215d962012-10-06 23:03:36 +020013598int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013599_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13600 Py_ssize_t start, Py_ssize_t end)
13601{
13602 Py_UCS4 maxchar;
13603 Py_ssize_t len;
13604
13605 if (PyUnicode_READY(str) == -1)
13606 return -1;
13607
13608 assert(0 <= start);
13609 assert(end <= PyUnicode_GET_LENGTH(str));
13610 assert(start <= end);
13611
13612 if (end == 0)
13613 return 0;
13614
13615 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13616 return _PyUnicodeWriter_WriteStr(writer, str);
13617
13618 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13619 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13620 else
13621 maxchar = writer->maxchar;
13622 len = end - start;
13623
13624 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13625 return -1;
13626
13627 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13628 str, start, len);
13629 writer->pos += len;
13630 return 0;
13631}
13632
13633int
Victor Stinner4a587072013-11-19 12:54:53 +010013634_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13635 const char *ascii, Py_ssize_t len)
13636{
13637 if (len == -1)
13638 len = strlen(ascii);
13639
13640 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13641
13642 if (writer->buffer == NULL && !writer->overallocate) {
13643 PyObject *str;
13644
13645 str = _PyUnicode_FromASCII(ascii, len);
13646 if (str == NULL)
13647 return -1;
13648
13649 writer->readonly = 1;
13650 writer->buffer = str;
13651 _PyUnicodeWriter_Update(writer);
13652 writer->pos += len;
13653 return 0;
13654 }
13655
13656 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13657 return -1;
13658
13659 switch (writer->kind)
13660 {
13661 case PyUnicode_1BYTE_KIND:
13662 {
13663 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13664 Py_UCS1 *data = writer->data;
13665
Christian Heimesf051e432016-09-13 20:22:02 +020013666 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013667 break;
13668 }
13669 case PyUnicode_2BYTE_KIND:
13670 {
13671 _PyUnicode_CONVERT_BYTES(
13672 Py_UCS1, Py_UCS2,
13673 ascii, ascii + len,
13674 (Py_UCS2 *)writer->data + writer->pos);
13675 break;
13676 }
13677 case PyUnicode_4BYTE_KIND:
13678 {
13679 _PyUnicode_CONVERT_BYTES(
13680 Py_UCS1, Py_UCS4,
13681 ascii, ascii + len,
13682 (Py_UCS4 *)writer->data + writer->pos);
13683 break;
13684 }
13685 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013686 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013687 }
13688
13689 writer->pos += len;
13690 return 0;
13691}
13692
13693int
13694_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13695 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013696{
13697 Py_UCS4 maxchar;
13698
13699 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13700 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13701 return -1;
13702 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13703 writer->pos += len;
13704 return 0;
13705}
13706
Victor Stinnerd3f08822012-05-29 12:57:52 +020013707PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013708_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013709{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013710 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013711
Victor Stinnerd3f08822012-05-29 12:57:52 +020013712 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013713 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013714 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013715 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013716
13717 str = writer->buffer;
13718 writer->buffer = NULL;
13719
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013720 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013721 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13722 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013723 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013724
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013725 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13726 PyObject *str2;
13727 str2 = resize_compact(str, writer->pos);
13728 if (str2 == NULL) {
13729 Py_DECREF(str);
13730 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013731 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013732 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013733 }
13734
Victor Stinner15a0bd32013-07-08 22:29:55 +020013735 assert(_PyUnicode_CheckConsistency(str, 1));
13736 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013737}
13738
/* Drop the writer's reference to its buffer, if any, and reset the pointer
   through Py_CLEAR.  Within this file it is called on error paths in place
   of _PyUnicodeWriter_Finish(). */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013739void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013740_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013741{
13742 Py_CLEAR(writer->buffer);
13743}
13744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013746
/* Docstrings for the "format" and "format_map" entries of
   unicode_methods[]. */
13747PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013749\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013750Return a formatted version of S, using substitutions from args and kwargs.\n\
13751The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013752
Eric Smith27bbca62010-11-04 17:06:58 +000013753PyDoc_STRVAR(format_map__doc__,
13754 "S.format_map(mapping) -> str\n\
13755\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013756Return a formatted version of S, using substitutions from mapping.\n\
13757The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013758
INADA Naoki3ae20562017-01-16 20:41:20 +090013759/*[clinic input]
13760str.__format__ as unicode___format__
13761
13762 format_spec: unicode
13763 /
13764
13765Return a formatted version of the string as described by format_spec.
13766[clinic start generated code]*/
13767
Eric Smith4a7d76d2008-05-30 18:10:19 +000013768static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013769unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013770/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013771{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013772 _PyUnicodeWriter writer;
13773 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013774
Victor Stinnerd3f08822012-05-29 12:57:52 +020013775 if (PyUnicode_READY(self) == -1)
13776 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013777 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13779 self, format_spec, 0,
13780 PyUnicode_GET_LENGTH(format_spec));
13781 if (ret == -1) {
13782 _PyUnicodeWriter_Dealloc(&writer);
13783 return NULL;
13784 }
13785 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013786}
13787
INADA Naoki3ae20562017-01-16 20:41:20 +090013788/*[clinic input]
13789str.__sizeof__ as unicode_sizeof
13790
13791Return the size of the string in memory, in bytes.
13792[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013793
13794static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013795unicode_sizeof_impl(PyObject *self)
13796/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013798 Py_ssize_t size;
13799
13800 /* If it's a compact object, account for base structure +
13801 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013802 if (PyUnicode_IS_COMPACT_ASCII(self))
13803 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13804 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013806 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013807 else {
13808 /* If it is a two-block object, account for base object, and
13809 for character block if present. */
13810 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013811 if (_PyUnicode_DATA_ANY(self))
13812 size += (PyUnicode_GET_LENGTH(self) + 1) *
13813 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013814 }
13815 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013816 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013817 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13818 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13819 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13820 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013821
13822 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013823}
13824
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013825static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013826unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013827{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013828 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 if (!copy)
13830 return NULL;
13831 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013832}
13833
/* Method table for the str type.  The UNICODE_*_METHODDEF entries are
   macros defined elsewhere (presumably Argument Clinic generated -- confirm);
   the remaining entries are written out by hand.  The table ends with a
   {NULL, NULL} sentinel. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013834static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013835 UNICODE_ENCODE_METHODDEF
13836 UNICODE_REPLACE_METHODDEF
13837 UNICODE_SPLIT_METHODDEF
13838 UNICODE_RSPLIT_METHODDEF
13839 UNICODE_JOIN_METHODDEF
13840 UNICODE_CAPITALIZE_METHODDEF
13841 UNICODE_CASEFOLD_METHODDEF
13842 UNICODE_TITLE_METHODDEF
13843 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013844 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013845 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013846 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013847 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013848 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013849 UNICODE_LJUST_METHODDEF
13850 UNICODE_LOWER_METHODDEF
13851 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013852 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13853 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013854 UNICODE_RJUST_METHODDEF
13855 UNICODE_RSTRIP_METHODDEF
13856 UNICODE_RPARTITION_METHODDEF
13857 UNICODE_SPLITLINES_METHODDEF
13858 UNICODE_STRIP_METHODDEF
13859 UNICODE_SWAPCASE_METHODDEF
13860 UNICODE_TRANSLATE_METHODDEF
13861 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013862 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13863 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013864 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013865 UNICODE_ISLOWER_METHODDEF
13866 UNICODE_ISUPPER_METHODDEF
13867 UNICODE_ISTITLE_METHODDEF
13868 UNICODE_ISSPACE_METHODDEF
13869 UNICODE_ISDECIMAL_METHODDEF
13870 UNICODE_ISDIGIT_METHODDEF
13871 UNICODE_ISNUMERIC_METHODDEF
13872 UNICODE_ISALPHA_METHODDEF
13873 UNICODE_ISALNUM_METHODDEF
13874 UNICODE_ISIDENTIFIER_METHODDEF
13875 UNICODE_ISPRINTABLE_METHODDEF
13876 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013877 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013878 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013879 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013880 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013881 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013882#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013883 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013884 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013885#endif
13886
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013887 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888 {NULL, NULL}
13889};
13890
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013891static PyObject *
13892unicode_mod(PyObject *v, PyObject *w)
13893{
Brian Curtindfc80e32011-08-10 20:28:54 -050013894 if (!PyUnicode_Check(v))
13895 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013897}
13898
/* Number-protocol slots for str.  Only nb_remainder is filled in (with
   unicode_mod, which forwards to PyUnicode_Format); the slots after it are
   left to the initializer's implicit zero-fill. */
13899static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013900 0, /*nb_add*/
13901 0, /*nb_subtract*/
13902 0, /*nb_multiply*/
13903 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013904};
13905
/* Sequence-protocol slots for str: length, concatenation, repetition, item
   access and containment are provided; the slice and item-assignment slots
   are left empty (0). */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013906static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013907 (lenfunc) unicode_length, /* sq_length */
13908 PyUnicode_Concat, /* sq_concat */
13909 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13910 (ssizeargfunc) unicode_getitem, /* sq_item */
13911 0, /* sq_slice */
13912 0, /* sq_ass_item */
13913 0, /* sq_ass_slice */
13914 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915};
13916
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013917static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013918unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013920 if (PyUnicode_READY(self) == -1)
13921 return NULL;
13922
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013923 if (PyIndex_Check(item)) {
13924 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013925 if (i == -1 && PyErr_Occurred())
13926 return NULL;
13927 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013928 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013929 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013930 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013931 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013932 PyObject *result;
13933 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013934 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013935 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013936
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013937 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013938 return NULL;
13939 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013940 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13941 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013942
13943 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013944 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013945 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013946 slicelength == PyUnicode_GET_LENGTH(self)) {
13947 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013948 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013949 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013950 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013951 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013952 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013953 src_kind = PyUnicode_KIND(self);
13954 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013955 if (!PyUnicode_IS_ASCII(self)) {
13956 kind_limit = kind_maxchar_limit(src_kind);
13957 max_char = 0;
13958 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13959 ch = PyUnicode_READ(src_kind, src_data, cur);
13960 if (ch > max_char) {
13961 max_char = ch;
13962 if (max_char >= kind_limit)
13963 break;
13964 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013965 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013966 }
Victor Stinner55c99112011-10-13 01:17:06 +020013967 else
13968 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013969 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013970 if (result == NULL)
13971 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013972 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013973 dest_data = PyUnicode_DATA(result);
13974
13975 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013976 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13977 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013978 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013979 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013980 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013981 } else {
13982 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13983 return NULL;
13984 }
13985}
13986
/* Mapping-protocol slots for str: length and subscription (indexing and
   slicing through unicode_subscript); item assignment is not supported. */
13987static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013988 (lenfunc)unicode_length, /* mp_length */
13989 (binaryfunc)unicode_subscript, /* mp_subscript */
13990 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013991};
13992
Guido van Rossumd57fd912000-03-10 22:53:23 +000013993
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994/* Helpers for PyUnicode_Format() */
13995
/* State shared by the PyUnicode_Format() helpers.

   args, arglen, argidx: consumed by unicode_format_getnextarg(), which
       hands out one argument per call while argidx < arglen; a negative
       arglen makes it return `args` itself instead of indexing a tuple.
   writer: the _PyUnicodeWriter receiving the output.
   The remaining fields are used by code outside this excerpt; presumably
   dict is the mapping for %(name)s lookups and the fmt* fields are a cursor
   over the format string -- TODO confirm against PyUnicode_Format(). */
Victor Stinnera47082312012-10-04 02:19:54 +020013996struct unicode_formatter_t {
13997 PyObject *args;
13998 int args_owned;
13999 Py_ssize_t arglen, argidx;
14000 PyObject *dict;
14001
14002 enum PyUnicode_Kind fmtkind;
14003 Py_ssize_t fmtcnt, fmtpos;
14004 void *fmtdata;
14005 PyObject *fmtstr;
14006
14007 _PyUnicodeWriter writer;
14008};
14009
/* One parsed conversion specification of a %-format string.

   ch:    conversion character; formatfloat() passes it to
          PyOS_double_to_string() as the format code and mainformatlong()
          uses it as the integer conversion type.
   flags: bitmask tested against F_ALT, F_SIGN and F_BLANK.
   width: field width; mainformatlong() treats -1 as "not given".
   prec:  precision; -1 is treated as "not given" by mainformatlong(), and
          formatfloat() substitutes 6 for any negative value.
   sign:  not read by the code in this excerpt -- TODO confirm its meaning
          against the caller. */
14010struct unicode_format_arg_t {
14011 Py_UCS4 ch;
14012 int flags;
14013 Py_ssize_t width;
14014 int prec;
14015 int sign;
14016};
14017
Guido van Rossumd57fd912000-03-10 22:53:23 +000014018static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014019unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014020{
Victor Stinnera47082312012-10-04 02:19:54 +020014021 Py_ssize_t argidx = ctx->argidx;
14022
14023 if (argidx < ctx->arglen) {
14024 ctx->argidx++;
14025 if (ctx->arglen < 0)
14026 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014027 else
Victor Stinnera47082312012-10-04 02:19:54 +020014028 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014029 }
14030 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014031 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014032 return NULL;
14033}
14034
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014035/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036
Victor Stinnera47082312012-10-04 02:19:54 +020014037/* Format a float into the writer if the writer is not NULL, or into *p_output
14038 otherwise.
14039
14040 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014041static int
Victor Stinnera47082312012-10-04 02:19:54 +020014042formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14043 PyObject **p_output,
14044 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014045{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014046 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014047 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014048 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014049 int prec;
14050 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014051
Guido van Rossumd57fd912000-03-10 22:53:23 +000014052 x = PyFloat_AsDouble(v);
14053 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014054 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014055
Victor Stinnera47082312012-10-04 02:19:54 +020014056 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014057 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014058 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014059
Victor Stinnera47082312012-10-04 02:19:54 +020014060 if (arg->flags & F_ALT)
14061 dtoa_flags = Py_DTSF_ALT;
14062 else
14063 dtoa_flags = 0;
14064 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014065 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014066 return -1;
14067 len = strlen(p);
14068 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014069 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014070 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014071 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014072 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014073 }
14074 else
14075 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014076 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014077 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014078}
14079
Victor Stinnerd0880d52012-04-27 23:40:13 +020014080/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
14081 * and the alternate form, for Python's long (unbounded) ints. It's not used for
14082 * Python's regular ints.
14083 * Return value: a new PyUnicodeObject*, or NULL if error.
14084 * The output string is of the form
14085 * "-"? ("0o" | "0x" | "0X")? digit+
14086 * The base marker is present only for o, x and X conversions, and only
14087 * when alt is nonzero. The case of hex digits will be correct.
14088 * There will be at least prec digits, zero-filled on the left if
14089 * necessary to get that many.
14090 * val object to be converted
14091 * alt nonzero to keep the base marker (alternate form)
14092 * prec minimum number of digits; 0-fill on left if needed
14093 * type a character in [diuoxX]; i and u act the same as d
14094 *
14095 * CAUTION: o, x and X conversions on regular ints can never
14096 * produce a '-' sign, but can for Python's unbounded ints.
14097 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014098PyObject *
14099_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014100{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014101 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014102 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014103 Py_ssize_t i;
14104 int sign; /* 1 if '-', else 0 */
14105 int len; /* number of characters */
14106 Py_ssize_t llen;
14107 int numdigits; /* len == numnondigits + numdigits */
14108 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014109
Victor Stinnerd0880d52012-04-27 23:40:13 +020014110 /* Avoid exceeding SSIZE_T_MAX */
14111 if (prec > INT_MAX-3) {
14112 PyErr_SetString(PyExc_OverflowError,
14113 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014114 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014115 }
14116
14117 assert(PyLong_Check(val));
14118
/* Produce the basic textual form; for o/x/X it starts with a two-character
   base marker, which is counted in numnondigits. */
14119 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014120 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014121 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014122 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014123 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014124 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014125 /* int and int subclasses should print numerically when a numeric */
14126 /* format code is used (see issue18780) */
14127 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014128 break;
14129 case 'o':
14130 numnondigits = 2;
14131 result = PyNumber_ToBase(val, 8);
14132 break;
14133 case 'x':
14134 case 'X':
14135 numnondigits = 2;
14136 result = PyNumber_ToBase(val, 16);
14137 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014138 }
14139 if (!result)
14140 return NULL;
14141
14142 assert(unicode_modifiable(result));
14143 assert(PyUnicode_IS_READY(result));
14144 assert(PyUnicode_IS_ASCII(result));
14145
14146 /* To modify the string in-place, there can only be one reference. */
14147 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014148 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014149 PyErr_BadInternalCall();
14150 return NULL;
14151 }
14152 buf = PyUnicode_DATA(result);
14153 llen = PyUnicode_GET_LENGTH(result);
14154 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014155 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014156 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014157 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014158 return NULL;
14159 }
14160 len = (int)llen;
14161 sign = buf[0] == '-';
14162 numnondigits += sign;
14163 numdigits = len - numnondigits;
14164 assert(numdigits > 0);
14165
14166 /* Get rid of base marker unless F_ALT */
14167 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014168 (type == 'o' || type == 'x' || type == 'X'))) {
14169 assert(buf[sign] == '0');
14170 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14171 buf[sign+1] == 'o');
14172 numnondigits -= 2;
/* Advance past two characters; for a negative number the new first
   position is then overwritten with '-' so the sign directly precedes
   the digits. */
14173 buf += 2;
14174 len -= 2;
14175 if (sign)
14176 buf[0] = '-';
14177 assert(len == numnondigits + numdigits);
14178 assert(numdigits > 0);
14179 }
14180
14181 /* Fill with leading zeroes to meet minimum width. */
14182 if (prec > numdigits) {
/* The padded text is assembled in a temporary bytes object, which then
   replaces `result`; it is converted to a str further down. */
14183 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14184 numnondigits + prec);
14185 char *b1;
14186 if (!r1) {
14187 Py_DECREF(result);
14188 return NULL;
14189 }
14190 b1 = PyBytes_AS_STRING(r1);
14191 for (i = 0; i < numnondigits; ++i)
14192 *b1++ = *buf++;
14193 for (i = 0; i < prec - numdigits; i++)
14194 *b1++ = '0';
14195 for (i = 0; i < numdigits; i++)
14196 *b1++ = *buf++;
14197 *b1 = '\0';
14198 Py_DECREF(result);
14199 result = r1;
14200 buf = PyBytes_AS_STRING(result);
14201 len = numnondigits + prec;
14202 }
14203
14204 /* Fix up case for hex conversions. */
14205 if (type == 'X') {
14206 /* Need to convert all lower case letters to upper case.
14207 and need to convert 0x to 0X (and -0x to -0X). */
14208 for (i = 0; i < len; i++)
14209 if (buf[i] >= 'a' && buf[i] <= 'x')
14210 buf[i] -= 'a'-'A';
14211 }
/* Build a fresh str when `result` is the temporary bytes object or when
   `buf` no longer points at the start of its data; otherwise only shrink
   the existing str if its length changed. */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014212 if (!PyUnicode_Check(result)
14213 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014214 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014215 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014216 Py_DECREF(result);
14217 result = unicode;
14218 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014219 else if (len != PyUnicode_GET_LENGTH(result)) {
14220 if (PyUnicode_Resize(&result, len) < 0)
14221 Py_CLEAR(result);
14222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014223 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014224}
14225
Ethan Furmandf3ed242014-01-05 06:50:30 -080014226/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014227 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014228 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014229 * -1 and raise an exception on error */
14230static int
Victor Stinnera47082312012-10-04 02:19:54 +020014231mainformatlong(PyObject *v,
14232 struct unicode_format_arg_t *arg,
14233 PyObject **p_output,
14234 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014235{
14236 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014237 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014238
14239 if (!PyNumber_Check(v))
14240 goto wrongtype;
14241
Ethan Furman9ab74802014-03-21 06:38:46 -070014242 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014243 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014244 if (type == 'o' || type == 'x' || type == 'X') {
14245 iobj = PyNumber_Index(v);
14246 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014247 if (PyErr_ExceptionMatches(PyExc_TypeError))
14248 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014249 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014250 }
14251 }
14252 else {
14253 iobj = PyNumber_Long(v);
14254 if (iobj == NULL ) {
14255 if (PyErr_ExceptionMatches(PyExc_TypeError))
14256 goto wrongtype;
14257 return -1;
14258 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014259 }
14260 assert(PyLong_Check(iobj));
14261 }
14262 else {
14263 iobj = v;
14264 Py_INCREF(iobj);
14265 }
14266
14267 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014268 && arg->width == -1 && arg->prec == -1
14269 && !(arg->flags & (F_SIGN | F_BLANK))
14270 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014271 {
14272 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014273 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014274 int base;
14275
Victor Stinnera47082312012-10-04 02:19:54 +020014276 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 {
14278 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014279 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280 case 'd':
14281 case 'i':
14282 case 'u':
14283 base = 10;
14284 break;
14285 case 'o':
14286 base = 8;
14287 break;
14288 case 'x':
14289 case 'X':
14290 base = 16;
14291 break;
14292 }
14293
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014294 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14295 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014296 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014297 }
14298 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014299 return 1;
14300 }
14301
Ethan Furmanb95b5612015-01-23 20:05:18 -080014302 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014303 Py_DECREF(iobj);
14304 if (res == NULL)
14305 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014306 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307 return 0;
14308
14309wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014310 switch(type)
14311 {
14312 case 'o':
14313 case 'x':
14314 case 'X':
14315 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014316 "%%%c format: an integer is required, "
14317 "not %.200s",
14318 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014319 break;
14320 default:
14321 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014322 "%%%c format: a number is required, "
14323 "not %.200s",
14324 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014325 break;
14326 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014327 return -1;
14328}
14329
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014330static Py_UCS4
14331formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014332{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014333 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014334 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014335 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014336 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014337 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014338 goto onError;
14339 }
14340 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014341 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014342 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014343 /* make sure number is a type of integer */
14344 if (!PyLong_Check(v)) {
14345 iobj = PyNumber_Index(v);
14346 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014347 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014348 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014349 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014350 Py_DECREF(iobj);
14351 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014352 else {
14353 x = PyLong_AsLong(v);
14354 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014355 if (x == -1 && PyErr_Occurred())
14356 goto onError;
14357
Victor Stinner8faf8212011-12-08 22:14:11 +010014358 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014359 PyErr_SetString(PyExc_OverflowError,
14360 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014361 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014362 }
14363
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014364 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014365 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014366
Benjamin Peterson29060642009-01-31 22:14:21 +000014367 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014368 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014369 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014370 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014371}
14372
Victor Stinnera47082312012-10-04 02:19:54 +020014373/* Parse options of an argument: flags, width, precision.
14374 Handle also "%(name)" syntax.
14375
14376 Return 0 if the argument has been formatted into arg->str.
14377 Return 1 if the argument has been written into ctx->writer,
14378 Raise an exception and return -1 on error. */
14379static int
14380unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14381 struct unicode_format_arg_t *arg)
14382{
14383#define FORMAT_READ(ctx) \
14384 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14385
14386 PyObject *v;
14387
Victor Stinnera47082312012-10-04 02:19:54 +020014388 if (arg->ch == '(') {
14389 /* Get argument value from a dictionary. Example: "%(name)s". */
14390 Py_ssize_t keystart;
14391 Py_ssize_t keylen;
14392 PyObject *key;
14393 int pcount = 1;
14394
14395 if (ctx->dict == NULL) {
14396 PyErr_SetString(PyExc_TypeError,
14397 "format requires a mapping");
14398 return -1;
14399 }
14400 ++ctx->fmtpos;
14401 --ctx->fmtcnt;
14402 keystart = ctx->fmtpos;
14403 /* Skip over balanced parentheses */
14404 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14405 arg->ch = FORMAT_READ(ctx);
14406 if (arg->ch == ')')
14407 --pcount;
14408 else if (arg->ch == '(')
14409 ++pcount;
14410 ctx->fmtpos++;
14411 }
14412 keylen = ctx->fmtpos - keystart - 1;
14413 if (ctx->fmtcnt < 0 || pcount > 0) {
14414 PyErr_SetString(PyExc_ValueError,
14415 "incomplete format key");
14416 return -1;
14417 }
14418 key = PyUnicode_Substring(ctx->fmtstr,
14419 keystart, keystart + keylen);
14420 if (key == NULL)
14421 return -1;
14422 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014423 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014424 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014425 }
14426 ctx->args = PyObject_GetItem(ctx->dict, key);
14427 Py_DECREF(key);
14428 if (ctx->args == NULL)
14429 return -1;
14430 ctx->args_owned = 1;
14431 ctx->arglen = -1;
14432 ctx->argidx = -2;
14433 }
14434
14435 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014436 while (--ctx->fmtcnt >= 0) {
14437 arg->ch = FORMAT_READ(ctx);
14438 ctx->fmtpos++;
14439 switch (arg->ch) {
14440 case '-': arg->flags |= F_LJUST; continue;
14441 case '+': arg->flags |= F_SIGN; continue;
14442 case ' ': arg->flags |= F_BLANK; continue;
14443 case '#': arg->flags |= F_ALT; continue;
14444 case '0': arg->flags |= F_ZERO; continue;
14445 }
14446 break;
14447 }
14448
14449 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014450 if (arg->ch == '*') {
14451 v = unicode_format_getnextarg(ctx);
14452 if (v == NULL)
14453 return -1;
14454 if (!PyLong_Check(v)) {
14455 PyErr_SetString(PyExc_TypeError,
14456 "* wants int");
14457 return -1;
14458 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014459 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014460 if (arg->width == -1 && PyErr_Occurred())
14461 return -1;
14462 if (arg->width < 0) {
14463 arg->flags |= F_LJUST;
14464 arg->width = -arg->width;
14465 }
14466 if (--ctx->fmtcnt >= 0) {
14467 arg->ch = FORMAT_READ(ctx);
14468 ctx->fmtpos++;
14469 }
14470 }
14471 else if (arg->ch >= '0' && arg->ch <= '9') {
14472 arg->width = arg->ch - '0';
14473 while (--ctx->fmtcnt >= 0) {
14474 arg->ch = FORMAT_READ(ctx);
14475 ctx->fmtpos++;
14476 if (arg->ch < '0' || arg->ch > '9')
14477 break;
14478 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14479 mixing signed and unsigned comparison. Since arg->ch is between
14480 '0' and '9', casting to int is safe. */
14481 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14482 PyErr_SetString(PyExc_ValueError,
14483 "width too big");
14484 return -1;
14485 }
14486 arg->width = arg->width*10 + (arg->ch - '0');
14487 }
14488 }
14489
14490 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014491 if (arg->ch == '.') {
14492 arg->prec = 0;
14493 if (--ctx->fmtcnt >= 0) {
14494 arg->ch = FORMAT_READ(ctx);
14495 ctx->fmtpos++;
14496 }
14497 if (arg->ch == '*') {
14498 v = unicode_format_getnextarg(ctx);
14499 if (v == NULL)
14500 return -1;
14501 if (!PyLong_Check(v)) {
14502 PyErr_SetString(PyExc_TypeError,
14503 "* wants int");
14504 return -1;
14505 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014506 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014507 if (arg->prec == -1 && PyErr_Occurred())
14508 return -1;
14509 if (arg->prec < 0)
14510 arg->prec = 0;
14511 if (--ctx->fmtcnt >= 0) {
14512 arg->ch = FORMAT_READ(ctx);
14513 ctx->fmtpos++;
14514 }
14515 }
14516 else if (arg->ch >= '0' && arg->ch <= '9') {
14517 arg->prec = arg->ch - '0';
14518 while (--ctx->fmtcnt >= 0) {
14519 arg->ch = FORMAT_READ(ctx);
14520 ctx->fmtpos++;
14521 if (arg->ch < '0' || arg->ch > '9')
14522 break;
14523 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14524 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014525 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014526 return -1;
14527 }
14528 arg->prec = arg->prec*10 + (arg->ch - '0');
14529 }
14530 }
14531 }
14532
14533 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14534 if (ctx->fmtcnt >= 0) {
14535 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14536 if (--ctx->fmtcnt >= 0) {
14537 arg->ch = FORMAT_READ(ctx);
14538 ctx->fmtpos++;
14539 }
14540 }
14541 }
14542 if (ctx->fmtcnt < 0) {
14543 PyErr_SetString(PyExc_ValueError,
14544 "incomplete format");
14545 return -1;
14546 }
14547 return 0;
14548
14549#undef FORMAT_READ
14550}
14551
14552/* Format one argument. Supported conversion specifiers:
14553
14554 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014555 - "i", "d", "u": int or float
14556 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014557 - "e", "E", "f", "F", "g", "G": float
14558 - "c": int or str (1 character)
14559
Victor Stinner8dbd4212012-12-04 09:30:24 +010014560 When possible, the output is written directly into the Unicode writer
14561 (ctx->writer). A string is created when padding is required.
14562
Victor Stinnera47082312012-10-04 02:19:54 +020014563 Return 0 if the argument has been formatted into *p_str,
14564 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014565 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014566static int
14567unicode_format_arg_format(struct unicode_formatter_t *ctx,
14568 struct unicode_format_arg_t *arg,
14569 PyObject **p_str)
14570{
14571 PyObject *v;
14572 _PyUnicodeWriter *writer = &ctx->writer;
14573
14574 if (ctx->fmtcnt == 0)
14575 ctx->writer.overallocate = 0;
14576
Victor Stinnera47082312012-10-04 02:19:54 +020014577 v = unicode_format_getnextarg(ctx);
14578 if (v == NULL)
14579 return -1;
14580
Victor Stinnera47082312012-10-04 02:19:54 +020014581
14582 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014583 case 's':
14584 case 'r':
14585 case 'a':
14586 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14587 /* Fast path */
14588 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14589 return -1;
14590 return 1;
14591 }
14592
14593 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14594 *p_str = v;
14595 Py_INCREF(*p_str);
14596 }
14597 else {
14598 if (arg->ch == 's')
14599 *p_str = PyObject_Str(v);
14600 else if (arg->ch == 'r')
14601 *p_str = PyObject_Repr(v);
14602 else
14603 *p_str = PyObject_ASCII(v);
14604 }
14605 break;
14606
14607 case 'i':
14608 case 'd':
14609 case 'u':
14610 case 'o':
14611 case 'x':
14612 case 'X':
14613 {
14614 int ret = mainformatlong(v, arg, p_str, writer);
14615 if (ret != 0)
14616 return ret;
14617 arg->sign = 1;
14618 break;
14619 }
14620
14621 case 'e':
14622 case 'E':
14623 case 'f':
14624 case 'F':
14625 case 'g':
14626 case 'G':
14627 if (arg->width == -1 && arg->prec == -1
14628 && !(arg->flags & (F_SIGN | F_BLANK)))
14629 {
14630 /* Fast path */
14631 if (formatfloat(v, arg, NULL, writer) == -1)
14632 return -1;
14633 return 1;
14634 }
14635
14636 arg->sign = 1;
14637 if (formatfloat(v, arg, p_str, NULL) == -1)
14638 return -1;
14639 break;
14640
14641 case 'c':
14642 {
14643 Py_UCS4 ch = formatchar(v);
14644 if (ch == (Py_UCS4) -1)
14645 return -1;
14646 if (arg->width == -1 && arg->prec == -1) {
14647 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014648 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014649 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014650 return 1;
14651 }
14652 *p_str = PyUnicode_FromOrdinal(ch);
14653 break;
14654 }
14655
14656 default:
14657 PyErr_Format(PyExc_ValueError,
14658 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014659 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014660 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14661 (int)arg->ch,
14662 ctx->fmtpos - 1);
14663 return -1;
14664 }
14665 if (*p_str == NULL)
14666 return -1;
14667 assert (PyUnicode_Check(*p_str));
14668 return 0;
14669}
14670
14671static int
14672unicode_format_arg_output(struct unicode_formatter_t *ctx,
14673 struct unicode_format_arg_t *arg,
14674 PyObject *str)
14675{
14676 Py_ssize_t len;
14677 enum PyUnicode_Kind kind;
14678 void *pbuf;
14679 Py_ssize_t pindex;
14680 Py_UCS4 signchar;
14681 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014682 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014683 Py_ssize_t sublen;
14684 _PyUnicodeWriter *writer = &ctx->writer;
14685 Py_UCS4 fill;
14686
14687 fill = ' ';
14688 if (arg->sign && arg->flags & F_ZERO)
14689 fill = '0';
14690
14691 if (PyUnicode_READY(str) == -1)
14692 return -1;
14693
14694 len = PyUnicode_GET_LENGTH(str);
14695 if ((arg->width == -1 || arg->width <= len)
14696 && (arg->prec == -1 || arg->prec >= len)
14697 && !(arg->flags & (F_SIGN | F_BLANK)))
14698 {
14699 /* Fast path */
14700 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14701 return -1;
14702 return 0;
14703 }
14704
14705 /* Truncate the string for "s", "r" and "a" formats
14706 if the precision is set */
14707 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14708 if (arg->prec >= 0 && len > arg->prec)
14709 len = arg->prec;
14710 }
14711
14712 /* Adjust sign and width */
14713 kind = PyUnicode_KIND(str);
14714 pbuf = PyUnicode_DATA(str);
14715 pindex = 0;
14716 signchar = '\0';
14717 if (arg->sign) {
14718 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14719 if (ch == '-' || ch == '+') {
14720 signchar = ch;
14721 len--;
14722 pindex++;
14723 }
14724 else if (arg->flags & F_SIGN)
14725 signchar = '+';
14726 else if (arg->flags & F_BLANK)
14727 signchar = ' ';
14728 else
14729 arg->sign = 0;
14730 }
14731 if (arg->width < len)
14732 arg->width = len;
14733
14734 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014735 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014736 if (!(arg->flags & F_LJUST)) {
14737 if (arg->sign) {
14738 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014739 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014740 }
14741 else {
14742 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014743 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014744 }
14745 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014746 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14747 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014748 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014749 }
14750
Victor Stinnera47082312012-10-04 02:19:54 +020014751 buflen = arg->width;
14752 if (arg->sign && len == arg->width)
14753 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014754 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014755 return -1;
14756
14757 /* Write the sign if needed */
14758 if (arg->sign) {
14759 if (fill != ' ') {
14760 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14761 writer->pos += 1;
14762 }
14763 if (arg->width > len)
14764 arg->width--;
14765 }
14766
14767 /* Write the numeric prefix for "x", "X" and "o" formats
14768 if the alternate form is used.
14769 For example, write "0x" for the "%#x" format. */
14770 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14771 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14772 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14773 if (fill != ' ') {
14774 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14775 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14776 writer->pos += 2;
14777 pindex += 2;
14778 }
14779 arg->width -= 2;
14780 if (arg->width < 0)
14781 arg->width = 0;
14782 len -= 2;
14783 }
14784
14785 /* Pad left with the fill character if needed */
14786 if (arg->width > len && !(arg->flags & F_LJUST)) {
14787 sublen = arg->width - len;
14788 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14789 writer->pos += sublen;
14790 arg->width = len;
14791 }
14792
14793 /* If padding with spaces: write sign if needed and/or numeric prefix if
14794 the alternate form is used */
14795 if (fill == ' ') {
14796 if (arg->sign) {
14797 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14798 writer->pos += 1;
14799 }
14800 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14801 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14802 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14803 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14804 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14805 writer->pos += 2;
14806 pindex += 2;
14807 }
14808 }
14809
14810 /* Write characters */
14811 if (len) {
14812 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14813 str, pindex, len);
14814 writer->pos += len;
14815 }
14816
14817 /* Pad right with the fill character if needed */
14818 if (arg->width > len) {
14819 sublen = arg->width - len;
14820 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14821 writer->pos += sublen;
14822 }
14823 return 0;
14824}
14825
14826/* Helper of PyUnicode_Format(): format one arg.
14827 Return 0 on success, raise an exception and return -1 on error. */
14828static int
14829unicode_format_arg(struct unicode_formatter_t *ctx)
14830{
14831 struct unicode_format_arg_t arg;
14832 PyObject *str;
14833 int ret;
14834
Victor Stinner8dbd4212012-12-04 09:30:24 +010014835 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014836 if (arg.ch == '%') {
14837 ctx->fmtpos++;
14838 ctx->fmtcnt--;
14839 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14840 return -1;
14841 return 0;
14842 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014843 arg.flags = 0;
14844 arg.width = -1;
14845 arg.prec = -1;
14846 arg.sign = 0;
14847 str = NULL;
14848
Victor Stinnera47082312012-10-04 02:19:54 +020014849 ret = unicode_format_arg_parse(ctx, &arg);
14850 if (ret == -1)
14851 return -1;
14852
14853 ret = unicode_format_arg_format(ctx, &arg, &str);
14854 if (ret == -1)
14855 return -1;
14856
14857 if (ret != 1) {
14858 ret = unicode_format_arg_output(ctx, &arg, str);
14859 Py_DECREF(str);
14860 if (ret == -1)
14861 return -1;
14862 }
14863
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014864 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014865 PyErr_SetString(PyExc_TypeError,
14866 "not all arguments converted during string formatting");
14867 return -1;
14868 }
14869 return 0;
14870}
14871
Alexander Belopolsky40018472011-02-26 01:02:56 +000014872PyObject *
14873PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014874{
Victor Stinnera47082312012-10-04 02:19:54 +020014875 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014876
Guido van Rossumd57fd912000-03-10 22:53:23 +000014877 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014878 PyErr_BadInternalCall();
14879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014880 }
Victor Stinnera47082312012-10-04 02:19:54 +020014881
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014882 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014883 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014884
14885 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014886 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14887 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14888 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14889 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014890
Victor Stinner8f674cc2013-04-17 23:02:17 +020014891 _PyUnicodeWriter_Init(&ctx.writer);
14892 ctx.writer.min_length = ctx.fmtcnt + 100;
14893 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014894
Guido van Rossumd57fd912000-03-10 22:53:23 +000014895 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014896 ctx.arglen = PyTuple_Size(args);
14897 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014898 }
14899 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014900 ctx.arglen = -1;
14901 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014902 }
Victor Stinnera47082312012-10-04 02:19:54 +020014903 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014904 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014905 ctx.dict = args;
14906 else
14907 ctx.dict = NULL;
14908 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014909
Victor Stinnera47082312012-10-04 02:19:54 +020014910 while (--ctx.fmtcnt >= 0) {
14911 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014912 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014913
14914 nonfmtpos = ctx.fmtpos++;
14915 while (ctx.fmtcnt >= 0 &&
14916 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14917 ctx.fmtpos++;
14918 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014919 }
Victor Stinnera47082312012-10-04 02:19:54 +020014920 if (ctx.fmtcnt < 0) {
14921 ctx.fmtpos--;
14922 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014923 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014924
Victor Stinnercfc4c132013-04-03 01:48:39 +020014925 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14926 nonfmtpos, ctx.fmtpos) < 0)
14927 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014928 }
14929 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014930 ctx.fmtpos++;
14931 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014932 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014933 }
14934 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014935
Victor Stinnera47082312012-10-04 02:19:54 +020014936 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014937 PyErr_SetString(PyExc_TypeError,
14938 "not all arguments converted during string formatting");
14939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940 }
14941
Victor Stinnera47082312012-10-04 02:19:54 +020014942 if (ctx.args_owned) {
14943 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014944 }
Victor Stinnera47082312012-10-04 02:19:54 +020014945 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946
Benjamin Peterson29060642009-01-31 22:14:21 +000014947 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014948 _PyUnicodeWriter_Dealloc(&ctx.writer);
14949 if (ctx.args_owned) {
14950 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014951 }
14952 return NULL;
14953}
14954
Jeremy Hylton938ace62002-07-17 16:30:39 +000014955static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014956unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14957
Tim Peters6d6c1a32001-08-02 04:15:00 +000014958static PyObject *
14959unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14960{
Benjamin Peterson29060642009-01-31 22:14:21 +000014961 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014962 static char *kwlist[] = {"object", "encoding", "errors", 0};
14963 char *encoding = NULL;
14964 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014965
Benjamin Peterson14339b62009-01-31 16:36:08 +000014966 if (type != &PyUnicode_Type)
14967 return unicode_subtype_new(type, args, kwds);
14968 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014969 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014970 return NULL;
14971 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014972 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014973 if (encoding == NULL && errors == NULL)
14974 return PyObject_Str(x);
14975 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014976 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014977}
14978
Guido van Rossume023fe02001-08-30 03:12:59 +000014979static PyObject *
14980unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14981{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014982 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014983 Py_ssize_t length, char_size;
14984 int share_wstr, share_utf8;
14985 unsigned int kind;
14986 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014987
Benjamin Peterson14339b62009-01-31 16:36:08 +000014988 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014989
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014990 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014991 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014993 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014994 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014995 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014996 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014997 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014998
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014999 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015000 if (self == NULL) {
15001 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015002 return NULL;
15003 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015004 kind = PyUnicode_KIND(unicode);
15005 length = PyUnicode_GET_LENGTH(unicode);
15006
15007 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015008#ifdef Py_DEBUG
15009 _PyUnicode_HASH(self) = -1;
15010#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015011 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015012#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015013 _PyUnicode_STATE(self).interned = 0;
15014 _PyUnicode_STATE(self).kind = kind;
15015 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015016 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015017 _PyUnicode_STATE(self).ready = 1;
15018 _PyUnicode_WSTR(self) = NULL;
15019 _PyUnicode_UTF8_LENGTH(self) = 0;
15020 _PyUnicode_UTF8(self) = NULL;
15021 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015022 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015023
15024 share_utf8 = 0;
15025 share_wstr = 0;
15026 if (kind == PyUnicode_1BYTE_KIND) {
15027 char_size = 1;
15028 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15029 share_utf8 = 1;
15030 }
15031 else if (kind == PyUnicode_2BYTE_KIND) {
15032 char_size = 2;
15033 if (sizeof(wchar_t) == 2)
15034 share_wstr = 1;
15035 }
15036 else {
15037 assert(kind == PyUnicode_4BYTE_KIND);
15038 char_size = 4;
15039 if (sizeof(wchar_t) == 4)
15040 share_wstr = 1;
15041 }
15042
15043 /* Ensure we won't overflow the length. */
15044 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15045 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015046 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015047 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015048 data = PyObject_MALLOC((length + 1) * char_size);
15049 if (data == NULL) {
15050 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015051 goto onError;
15052 }
15053
Victor Stinnerc3c74152011-10-02 20:39:55 +020015054 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 if (share_utf8) {
15056 _PyUnicode_UTF8_LENGTH(self) = length;
15057 _PyUnicode_UTF8(self) = data;
15058 }
15059 if (share_wstr) {
15060 _PyUnicode_WSTR_LENGTH(self) = length;
15061 _PyUnicode_WSTR(self) = (wchar_t *)data;
15062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015063
Christian Heimesf051e432016-09-13 20:22:02 +020015064 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015065 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015066 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015067#ifdef Py_DEBUG
15068 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15069#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015070 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015071 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015072
15073onError:
15074 Py_DECREF(unicode);
15075 Py_DECREF(self);
15076 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015077}
15078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015079PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015080"str(object='') -> str\n\
15081str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015082\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015083Create a new string object from the given object. If encoding or\n\
15084errors is specified, then the object must expose a data buffer\n\
15085that will be decoded using the given encoding and error handler.\n\
15086Otherwise, returns the result of object.__str__() (if defined)\n\
15087or repr(object).\n\
15088encoding defaults to sys.getdefaultencoding().\n\
15089errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015090
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015091static PyObject *unicode_iter(PyObject *seq);
15092
Guido van Rossumd57fd912000-03-10 22:53:23 +000015093PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015094 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015095 "str", /* tp_name */
15096 sizeof(PyUnicodeObject), /* tp_basicsize */
15097 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015098 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015099 (destructor)unicode_dealloc, /* tp_dealloc */
15100 0, /* tp_print */
15101 0, /* tp_getattr */
15102 0, /* tp_setattr */
15103 0, /* tp_reserved */
15104 unicode_repr, /* tp_repr */
15105 &unicode_as_number, /* tp_as_number */
15106 &unicode_as_sequence, /* tp_as_sequence */
15107 &unicode_as_mapping, /* tp_as_mapping */
15108 (hashfunc) unicode_hash, /* tp_hash*/
15109 0, /* tp_call*/
15110 (reprfunc) unicode_str, /* tp_str */
15111 PyObject_GenericGetAttr, /* tp_getattro */
15112 0, /* tp_setattro */
15113 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015115 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15116 unicode_doc, /* tp_doc */
15117 0, /* tp_traverse */
15118 0, /* tp_clear */
15119 PyUnicode_RichCompare, /* tp_richcompare */
15120 0, /* tp_weaklistoffset */
15121 unicode_iter, /* tp_iter */
15122 0, /* tp_iternext */
15123 unicode_methods, /* tp_methods */
15124 0, /* tp_members */
15125 0, /* tp_getset */
15126 &PyBaseObject_Type, /* tp_base */
15127 0, /* tp_dict */
15128 0, /* tp_descr_get */
15129 0, /* tp_descr_set */
15130 0, /* tp_dictoffset */
15131 0, /* tp_init */
15132 0, /* tp_alloc */
15133 unicode_new, /* tp_new */
15134 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015135};
15136
15137/* Initialize the Unicode implementation */
15138
Victor Stinner3a50e702011-10-18 21:21:00 +020015139int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015140{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015141 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015143 0x000A, /* LINE FEED */
15144 0x000D, /* CARRIAGE RETURN */
15145 0x001C, /* FILE SEPARATOR */
15146 0x001D, /* GROUP SEPARATOR */
15147 0x001E, /* RECORD SEPARATOR */
15148 0x0085, /* NEXT LINE */
15149 0x2028, /* LINE SEPARATOR */
15150 0x2029, /* PARAGRAPH SEPARATOR */
15151 };
15152
Fred Drakee4315f52000-05-09 19:53:39 +000015153 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015154 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015155 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015156 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015157 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015158
Guido van Rossumcacfc072002-05-24 19:01:59 +000015159 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015160 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015161
15162 /* initialize the linebreak bloom filter */
15163 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015164 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015165 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015166
Christian Heimes26532f72013-07-20 14:57:16 +020015167 if (PyType_Ready(&EncodingMapType) < 0)
15168 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015169
Benjamin Petersonc4311282012-10-30 23:21:10 -040015170 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15171 Py_FatalError("Can't initialize field name iterator type");
15172
15173 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15174 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015175
Victor Stinner3a50e702011-10-18 21:21:00 +020015176 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015177}
15178
15179/* Finalize the Unicode implementation */
15180
/* Free-list hook for the str type.  This implementation has nothing to
   release, so the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    int freed = 0;

    return freed;
}
15186
Guido van Rossumd57fd912000-03-10 22:53:23 +000015187void
Thomas Wouters78890102000-07-22 19:25:51 +000015188_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015189{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015190 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015191
Serhiy Storchaka05997252013-01-26 12:14:02 +020015192 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015193
Serhiy Storchaka05997252013-01-26 12:14:02 +020015194 for (i = 0; i < 256; i++)
15195 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015196 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015197 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015198}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015199
Walter Dörwald16807132007-05-25 13:52:07 +000015200void
15201PyUnicode_InternInPlace(PyObject **p)
15202{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015203 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015204 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015205#ifdef Py_DEBUG
15206 assert(s != NULL);
15207 assert(_PyUnicode_CHECK(s));
15208#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015209 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015210 return;
15211#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015212 /* If it's a subclass, we don't really know what putting
15213 it in the interned dict might do. */
15214 if (!PyUnicode_CheckExact(s))
15215 return;
15216 if (PyUnicode_CHECK_INTERNED(s))
15217 return;
15218 if (interned == NULL) {
15219 interned = PyDict_New();
15220 if (interned == NULL) {
15221 PyErr_Clear(); /* Don't leave an exception */
15222 return;
15223 }
15224 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015225 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015226 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015228 if (t == NULL) {
15229 PyErr_Clear();
15230 return;
15231 }
15232 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015233 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015234 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015235 return;
15236 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015237 /* The two references in interned are not counted by refcnt.
15238 The deallocator will take care of this */
15239 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015240 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015241}
15242
15243void
15244PyUnicode_InternImmortal(PyObject **p)
15245{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 PyUnicode_InternInPlace(p);
15247 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015248 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015249 Py_INCREF(*p);
15250 }
Walter Dörwald16807132007-05-25 13:52:07 +000015251}
15252
15253PyObject *
15254PyUnicode_InternFromString(const char *cp)
15255{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 PyObject *s = PyUnicode_FromString(cp);
15257 if (s == NULL)
15258 return NULL;
15259 PyUnicode_InternInPlace(&s);
15260 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015261}
15262
Alexander Belopolsky40018472011-02-26 01:02:56 +000015263void
15264_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015265{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015266 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015267 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 Py_ssize_t i, n;
15269 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015270
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 if (interned == NULL || !PyDict_Check(interned))
15272 return;
15273 keys = PyDict_Keys(interned);
15274 if (keys == NULL || !PyList_Check(keys)) {
15275 PyErr_Clear();
15276 return;
15277 }
Walter Dörwald16807132007-05-25 13:52:07 +000015278
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15280 detector, interned unicode strings are not forcibly deallocated;
15281 rather, we give them their stolen references back, and then clear
15282 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015283
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 n = PyList_GET_SIZE(keys);
15285 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015286 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015287 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015288 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015289 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015290 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015292 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 case SSTATE_NOT_INTERNED:
15294 /* XXX Shouldn't happen */
15295 break;
15296 case SSTATE_INTERNED_IMMORTAL:
15297 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015298 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015299 break;
15300 case SSTATE_INTERNED_MORTAL:
15301 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015302 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 break;
15304 default:
15305 Py_FatalError("Inconsistent interned string state.");
15306 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015307 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 }
15309 fprintf(stderr, "total size of all interned strings: "
15310 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15311 "mortal/immortal\n", mortal_size, immortal_size);
15312 Py_DECREF(keys);
15313 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015314 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015315}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015316
15317
15318/********************* Unicode Iterator **************************/
15319
15320typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015321 PyObject_HEAD
15322 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015323 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015324} unicodeiterobject;
15325
15326static void
15327unicodeiter_dealloc(unicodeiterobject *it)
15328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 _PyObject_GC_UNTRACK(it);
15330 Py_XDECREF(it->it_seq);
15331 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015332}
15333
15334static int
15335unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15336{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 Py_VISIT(it->it_seq);
15338 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015339}
15340
15341static PyObject *
15342unicodeiter_next(unicodeiterobject *it)
15343{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015344 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015345
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 assert(it != NULL);
15347 seq = it->it_seq;
15348 if (seq == NULL)
15349 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015350 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015352 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15353 int kind = PyUnicode_KIND(seq);
15354 void *data = PyUnicode_DATA(seq);
15355 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15356 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 if (item != NULL)
15358 ++it->it_index;
15359 return item;
15360 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015361
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015363 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015365}
15366
15367static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015368unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015369{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 Py_ssize_t len = 0;
15371 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015372 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015374}
15375
15376PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15377
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015378static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015379unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015380{
15381 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015382 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015383 it->it_seq, it->it_index);
15384 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015385 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015386 if (u == NULL)
15387 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015388 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015389 }
15390}
15391
15392PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15393
15394static PyObject *
15395unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15396{
15397 Py_ssize_t index = PyLong_AsSsize_t(state);
15398 if (index == -1 && PyErr_Occurred())
15399 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015400 if (it->it_seq != NULL) {
15401 if (index < 0)
15402 index = 0;
15403 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15404 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15405 it->it_index = index;
15406 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015407 Py_RETURN_NONE;
15408}
15409
15410PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15411
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015412static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015413 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015414 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015415 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15416 reduce_doc},
15417 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15418 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015419 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015420};
15421
15422PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15424 "str_iterator", /* tp_name */
15425 sizeof(unicodeiterobject), /* tp_basicsize */
15426 0, /* tp_itemsize */
15427 /* methods */
15428 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15429 0, /* tp_print */
15430 0, /* tp_getattr */
15431 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015432 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 0, /* tp_repr */
15434 0, /* tp_as_number */
15435 0, /* tp_as_sequence */
15436 0, /* tp_as_mapping */
15437 0, /* tp_hash */
15438 0, /* tp_call */
15439 0, /* tp_str */
15440 PyObject_GenericGetAttr, /* tp_getattro */
15441 0, /* tp_setattro */
15442 0, /* tp_as_buffer */
15443 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15444 0, /* tp_doc */
15445 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15446 0, /* tp_clear */
15447 0, /* tp_richcompare */
15448 0, /* tp_weaklistoffset */
15449 PyObject_SelfIter, /* tp_iter */
15450 (iternextfunc)unicodeiter_next, /* tp_iternext */
15451 unicodeiter_methods, /* tp_methods */
15452 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015453};
15454
15455static PyObject *
15456unicode_iter(PyObject *seq)
15457{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015458 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015459
Benjamin Peterson14339b62009-01-31 16:36:08 +000015460 if (!PyUnicode_Check(seq)) {
15461 PyErr_BadInternalCall();
15462 return NULL;
15463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015464 if (PyUnicode_READY(seq) == -1)
15465 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015466 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15467 if (it == NULL)
15468 return NULL;
15469 it->it_index = 0;
15470 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015471 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015472 _PyObject_GC_TRACK(it);
15473 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015474}
15475
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015476
15477size_t
15478Py_UNICODE_strlen(const Py_UNICODE *u)
15479{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015480 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015481}
15482
15483Py_UNICODE*
15484Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15485{
15486 Py_UNICODE *u = s1;
15487 while ((*u++ = *s2++));
15488 return s1;
15489}
15490
15491Py_UNICODE*
15492Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15493{
15494 Py_UNICODE *u = s1;
15495 while ((*u++ = *s2++))
15496 if (n-- == 0)
15497 break;
15498 return s1;
15499}
15500
15501Py_UNICODE*
15502Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15503{
15504 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015505 u1 += wcslen(u1);
15506 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015507 return s1;
15508}
15509
15510int
15511Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15512{
15513 while (*s1 && *s2 && *s1 == *s2)
15514 s1++, s2++;
15515 if (*s1 && *s2)
15516 return (*s1 < *s2) ? -1 : +1;
15517 if (*s1)
15518 return 1;
15519 if (*s2)
15520 return -1;
15521 return 0;
15522}
15523
15524int
15525Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15526{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015527 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015528 for (; n != 0; n--) {
15529 u1 = *s1;
15530 u2 = *s2;
15531 if (u1 != u2)
15532 return (u1 < u2) ? -1 : +1;
15533 if (u1 == '\0')
15534 return 0;
15535 s1++;
15536 s2++;
15537 }
15538 return 0;
15539}
15540
15541Py_UNICODE*
15542Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15543{
15544 const Py_UNICODE *p;
15545 for (p = s; *p; p++)
15546 if (*p == c)
15547 return (Py_UNICODE*)p;
15548 return NULL;
15549}
15550
15551Py_UNICODE*
15552Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15553{
15554 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015555 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015556 while (p != s) {
15557 p--;
15558 if (*p == c)
15559 return (Py_UNICODE*)p;
15560 }
15561 return NULL;
15562}
Victor Stinner331ea922010-08-10 16:37:20 +000015563
Victor Stinner71133ff2010-09-01 23:43:53 +000015564Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015565PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015566{
Victor Stinner577db2c2011-10-11 22:12:48 +020015567 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015568 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015570 if (!PyUnicode_Check(unicode)) {
15571 PyErr_BadArgument();
15572 return NULL;
15573 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015574 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015575 if (u == NULL)
15576 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015577 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015578 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015579 PyErr_NoMemory();
15580 return NULL;
15581 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015582 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015583 size *= sizeof(Py_UNICODE);
15584 copy = PyMem_Malloc(size);
15585 if (copy == NULL) {
15586 PyErr_NoMemory();
15587 return NULL;
15588 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015589 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015590 return copy;
15591}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015592
Georg Brandl66c221e2010-10-14 07:04:07 +000015593/* A _string module, to export formatter_parser and formatter_field_name_split
15594 to the string.Formatter class implemented in Python. */
15595
15596static PyMethodDef _string_methods[] = {
15597 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15598 METH_O, PyDoc_STR("split the argument as a field name")},
15599 {"formatter_parser", (PyCFunction) formatter_parser,
15600 METH_O, PyDoc_STR("parse the argument as a format string")},
15601 {NULL, NULL}
15602};
15603
15604static struct PyModuleDef _string_module = {
15605 PyModuleDef_HEAD_INIT,
15606 "_string",
15607 PyDoc_STR("string helper module"),
15608 0,
15609 _string_methods,
15610 NULL,
15611 NULL,
15612 NULL,
15613 NULL
15614};
15615
15616PyMODINIT_FUNC
15617PyInit__string(void)
15618{
15619 return PyModule_Create(&_string_module);
15620}
15621
15622
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015623#ifdef __cplusplus
15624}
15625#endif