blob: 3e61c9c370104dc52b8a5f207919d8b27a189b9d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Larry Hastings61272b72014-01-07 12:41:53 -080052/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090053class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080054[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090055/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57/*[python input]
58class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68[python start generated code]*/
69/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000071/* --- Globals ------------------------------------------------------------
72
Serhiy Storchaka05997252013-01-26 12:14:02 +020073NOTE: In the interpreter's initialization phase, some globals are currently
74 initialized dynamically as needed. In the process Unicode objects may
75 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000076
77*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Largest code point handled by this module: U+10FFFF (1,114,111 decimal),
   the maximum code point as of Unicode 6.0. */
#define MAX_UNICODE 0x10ffff
86
/* Object check used by the assertions in the macros below.  Regular builds
   only test the type with PyUnicode_Check(); Py_DEBUG builds run
   _PyUnicode_CheckConsistency() with check_content=0 instead. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors for the string object structs.  The underscore-prefixed
   forms are raw struct member accesses (no checks); the unprefixed forms
   assert that the object is a ready unicode object first. */

/* Raw access to the utf8 pointer of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string: for compact ASCII objects this is the
   address immediately following the PyASCIIObject struct, otherwise the
   stored utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for compact ASCII objects this is the
   length member, otherwise the stored utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw member accesses (usable as lvalues). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, guarded by an object check. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local replacement for PyUnicode_READY(): asserts the object check,
   then evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
          0 :                                           \
          _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the utf8 pointer of op is the same address as its character data
   (PyUnicode_DATA).  Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as its character data
   (PyUnicode_DATA).  The comparison uses the macro argument itself; the
   previous text referenced an identifier named `unicode`, which only worked
   when the caller happened to have a variable of that name in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the object is not ready or the pointer
   is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156
/* Copy a run of characters from one character width to another.

   from_type / to_type: valid type names of the source and target characters.
   begin / end:         pointers delimiting the source ("from_type *").
   to:                  destination buffer ("to_type *"); it must have room
                        for (end - begin) elements.

   Elements are converted with a plain cast.  The bulk of the work is done
   four elements at a time; the remaining 0-3 elements are copied singly. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); _iter++, _to++)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
/* Overallocation factor: 4 stands for overallocating by 25%, 2 for
   overallocating by 50%. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor. */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor. */
#  define OVERALLOCATE_FACTOR 2
#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215#define _Py_RETURN_UNICODE_EMPTY() \
216 do { \
217 _Py_INCREF_UNICODE_EMPTY(); \
218 return unicode_empty; \
219 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200221/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700222static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
224
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200225/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000228/* Single character Unicode strings in the Latin-1 range are being
229 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200230static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* last entry: keeps the table at 128 elements */
};
262
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200263/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200264static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100266static int unicode_modifiable(PyObject *unicode);
267
Victor Stinnerfe226c02011-10-03 03:52:20 +0200268
Alexander Belopolsky40018472011-02-26 01:02:56 +0000269static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100270_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200271static PyObject *
272_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
273static PyObject *
274_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
275
276static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000278 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100279 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
281
Alexander Belopolsky40018472011-02-26 01:02:56 +0000282static void
283raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300284 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100285 PyObject *unicode,
286 Py_ssize_t startpos, Py_ssize_t endpos,
287 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000288
/* Same idea for line breaks: a 128-entry table indexed by ASCII code,
   1 for characters treated as line breaks and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* last entry: keeps the table at 128 elements */
};
316
INADA Naoki3ae20562017-01-16 20:41:20 +0900317static int convert_uc(PyObject *obj, void *addr);
318
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300319#include "clinic/unicodeobject.c.h"
320
Victor Stinner3d4226a2018-08-29 22:21:32 +0200321_Py_error_handler
322_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200323{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200324 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200325 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200326 }
327 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200328 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200329 }
330 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200331 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200332 }
333 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200334 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200335 }
336 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200337 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200338 }
339 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200340 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200341 }
342 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200343 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200344 }
Victor Stinner50149202015-09-22 00:26:54 +0200345 return _Py_ERROR_OTHER;
346}
347
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300348/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
349 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000351PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000352{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000353#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000354 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000355#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 /* This is actually an illegal character, so it should
357 not be passed to unichr. */
358 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000359#endif
360}
361
#ifdef Py_DEBUG
/* Debug-build self-check of a unicode object's internal invariants.

   op:            object to verify; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not a legacy wchar-kind
                  string), also scan the characters to verify that the
                  narrowest possible kind is used and that the buffer is
                  null-terminated.

   Every violation trips an assert(); the function itself always returns 1
   so that it can be wrapped in assert() by callers. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII object: the data follows the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that is ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr may alias the data only when the kind has the same
               width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the buffer must end with a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
477
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478static PyObject*
479unicode_result_wchar(PyObject *unicode)
480{
481#ifndef Py_DEBUG
482 Py_ssize_t len;
483
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100484 len = _PyUnicode_WSTR_LENGTH(unicode);
485 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100486 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200487 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 }
489
490 if (len == 1) {
491 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100492 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100493 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
494 Py_DECREF(unicode);
495 return latin1_char;
496 }
497 }
498
499 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200500 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 return NULL;
502 }
503#else
Victor Stinneraa771272012-10-04 02:32:58 +0200504 assert(Py_REFCNT(unicode) == 1);
505
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100506 /* don't make the result ready in debug mode to ensure that the caller
507 makes the string ready before using it */
508 assert(_PyUnicode_CheckConsistency(unicode, 1));
509#endif
510 return unicode;
511}
512
513static PyObject*
514unicode_result_ready(PyObject *unicode)
515{
516 Py_ssize_t length;
517
518 length = PyUnicode_GET_LENGTH(unicode);
519 if (length == 0) {
520 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100521 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200522 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100523 }
524 return unicode_empty;
525 }
526
527 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200528 void *data = PyUnicode_DATA(unicode);
529 int kind = PyUnicode_KIND(unicode);
530 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100531 if (ch < 256) {
532 PyObject *latin1_char = unicode_latin1[ch];
533 if (latin1_char != NULL) {
534 if (unicode != latin1_char) {
535 Py_INCREF(latin1_char);
536 Py_DECREF(unicode);
537 }
538 return latin1_char;
539 }
540 else {
541 assert(_PyUnicode_CheckConsistency(unicode, 1));
542 Py_INCREF(unicode);
543 unicode_latin1[ch] = unicode;
544 return unicode;
545 }
546 }
547 }
548
549 assert(_PyUnicode_CheckConsistency(unicode, 1));
550 return unicode;
551}
552
553static PyObject*
554unicode_result(PyObject *unicode)
555{
556 assert(_PyUnicode_CHECK(unicode));
557 if (PyUnicode_IS_READY(unicode))
558 return unicode_result_ready(unicode);
559 else
560 return unicode_result_wchar(unicode);
561}
562
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563static PyObject*
564unicode_result_unchanged(PyObject *unicode)
565{
566 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500567 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100568 return NULL;
569 Py_INCREF(unicode);
570 return unicode;
571 }
572 else
573 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100574 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100575}
576
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200577/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
578 ASCII, Latin1, UTF-8, etc. */
579static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200580backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200581 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
582{
Victor Stinnerad771582015-10-09 12:38:53 +0200583 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200584 Py_UCS4 ch;
585 enum PyUnicode_Kind kind;
586 void *data;
587
588 assert(PyUnicode_IS_READY(unicode));
589 kind = PyUnicode_KIND(unicode);
590 data = PyUnicode_DATA(unicode);
591
592 size = 0;
593 /* determine replacement size */
594 for (i = collstart; i < collend; ++i) {
595 Py_ssize_t incr;
596
597 ch = PyUnicode_READ(kind, data, i);
598 if (ch < 0x100)
599 incr = 2+2;
600 else if (ch < 0x10000)
601 incr = 2+4;
602 else {
603 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200604 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200605 }
606 if (size > PY_SSIZE_T_MAX - incr) {
607 PyErr_SetString(PyExc_OverflowError,
608 "encoded result is too long for a Python string");
609 return NULL;
610 }
611 size += incr;
612 }
613
Victor Stinnerad771582015-10-09 12:38:53 +0200614 str = _PyBytesWriter_Prepare(writer, str, size);
615 if (str == NULL)
616 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200617
618 /* generate replacement */
619 for (i = collstart; i < collend; ++i) {
620 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200621 *str++ = '\\';
622 if (ch >= 0x00010000) {
623 *str++ = 'U';
624 *str++ = Py_hexdigits[(ch>>28)&0xf];
625 *str++ = Py_hexdigits[(ch>>24)&0xf];
626 *str++ = Py_hexdigits[(ch>>20)&0xf];
627 *str++ = Py_hexdigits[(ch>>16)&0xf];
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200630 }
Victor Stinner797485e2015-10-09 03:17:30 +0200631 else if (ch >= 0x100) {
632 *str++ = 'u';
633 *str++ = Py_hexdigits[(ch>>12)&0xf];
634 *str++ = Py_hexdigits[(ch>>8)&0xf];
635 }
636 else
637 *str++ = 'x';
638 *str++ = Py_hexdigits[(ch>>4)&0xf];
639 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200640 }
641 return str;
642}
643
644/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
645 ASCII, Latin1, UTF-8, etc. */
646static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200647xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200648 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
649{
Victor Stinnerad771582015-10-09 12:38:53 +0200650 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200651 Py_UCS4 ch;
652 enum PyUnicode_Kind kind;
653 void *data;
654
655 assert(PyUnicode_IS_READY(unicode));
656 kind = PyUnicode_KIND(unicode);
657 data = PyUnicode_DATA(unicode);
658
659 size = 0;
660 /* determine replacement size */
661 for (i = collstart; i < collend; ++i) {
662 Py_ssize_t incr;
663
664 ch = PyUnicode_READ(kind, data, i);
665 if (ch < 10)
666 incr = 2+1+1;
667 else if (ch < 100)
668 incr = 2+2+1;
669 else if (ch < 1000)
670 incr = 2+3+1;
671 else if (ch < 10000)
672 incr = 2+4+1;
673 else if (ch < 100000)
674 incr = 2+5+1;
675 else if (ch < 1000000)
676 incr = 2+6+1;
677 else {
678 assert(ch <= MAX_UNICODE);
679 incr = 2+7+1;
680 }
681 if (size > PY_SSIZE_T_MAX - incr) {
682 PyErr_SetString(PyExc_OverflowError,
683 "encoded result is too long for a Python string");
684 return NULL;
685 }
686 size += incr;
687 }
688
Victor Stinnerad771582015-10-09 12:38:53 +0200689 str = _PyBytesWriter_Prepare(writer, str, size);
690 if (str == NULL)
691 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200692
693 /* generate replacement */
694 for (i = collstart; i < collend; ++i) {
695 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
696 }
697 return str;
698}
699
Thomas Wouters477c8d52006-05-27 19:21:47 +0000700/* --- Bloom Filters ----------------------------------------------------- */
701
702/* stuff to implement simple "bloom filters" for Unicode characters.
703 to keep things simple, we use a single bitmask, using the least 5
704 bits from each unicode characters as the bit index. */
705
706/* the linebreak mask is set up by Unicode_Init below */
707
Antoine Pitrouf068f942010-01-13 14:19:12 +0000708#if LONG_BIT >= 128
709#define BLOOM_WIDTH 128
710#elif LONG_BIT >= 64
711#define BLOOM_WIDTH 64
712#elif LONG_BIT >= 32
713#define BLOOM_WIDTH 32
714#else
715#error "LONG_BIT is smaller than 32"
716#endif
717
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718#define BLOOM_MASK unsigned long
719
Serhiy Storchaka05997252013-01-26 12:14:02 +0200720static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000721
Antoine Pitrouf068f942010-01-13 14:19:12 +0000722#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000723
Benjamin Peterson29060642009-01-31 22:14:21 +0000724#define BLOOM_LINEBREAK(ch) \
725 ((ch) < 128U ? ascii_linebreak[(ch)] : \
726 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000727
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700728static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730{
Victor Stinnera85af502013-04-09 21:53:54 +0200731#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
732 do { \
733 TYPE *data = (TYPE *)PTR; \
734 TYPE *end = data + LEN; \
735 Py_UCS4 ch; \
736 for (; data != end; data++) { \
737 ch = *data; \
738 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
739 } \
740 break; \
741 } while (0)
742
Thomas Wouters477c8d52006-05-27 19:21:47 +0000743 /* calculate simple bloom-style bitmask for a given unicode string */
744
Antoine Pitrouf068f942010-01-13 14:19:12 +0000745 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000746
747 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200748 switch (kind) {
749 case PyUnicode_1BYTE_KIND:
750 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
751 break;
752 case PyUnicode_2BYTE_KIND:
753 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
754 break;
755 case PyUnicode_4BYTE_KIND:
756 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
757 break;
758 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700759 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200760 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000761 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200762
763#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000764}
765
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300766static int
767ensure_unicode(PyObject *obj)
768{
769 if (!PyUnicode_Check(obj)) {
770 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +0200771 "must be str, not %T", obj);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300772 return -1;
773 }
774 return PyUnicode_READY(obj);
775}
776
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200777/* Compilation of templated routines */
778
779#include "stringlib/asciilib.h"
780#include "stringlib/fastsearch.h"
781#include "stringlib/partition.h"
782#include "stringlib/split.h"
783#include "stringlib/count.h"
784#include "stringlib/find.h"
785#include "stringlib/find_max_char.h"
786#include "stringlib/localeutil.h"
787#include "stringlib/undef.h"
788
789#include "stringlib/ucs1lib.h"
790#include "stringlib/fastsearch.h"
791#include "stringlib/partition.h"
792#include "stringlib/split.h"
793#include "stringlib/count.h"
794#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300795#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200796#include "stringlib/find_max_char.h"
797#include "stringlib/localeutil.h"
798#include "stringlib/undef.h"
799
800#include "stringlib/ucs2lib.h"
801#include "stringlib/fastsearch.h"
802#include "stringlib/partition.h"
803#include "stringlib/split.h"
804#include "stringlib/count.h"
805#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300806#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200807#include "stringlib/find_max_char.h"
808#include "stringlib/localeutil.h"
809#include "stringlib/undef.h"
810
811#include "stringlib/ucs4lib.h"
812#include "stringlib/fastsearch.h"
813#include "stringlib/partition.h"
814#include "stringlib/split.h"
815#include "stringlib/count.h"
816#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300817#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200818#include "stringlib/find_max_char.h"
819#include "stringlib/localeutil.h"
820#include "stringlib/undef.h"
821
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200822#include "stringlib/unicodedefs.h"
823#include "stringlib/fastsearch.h"
824#include "stringlib/count.h"
825#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100826#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200827
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828/* --- Unicode Object ----------------------------------------------------- */
829
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700830static inline Py_ssize_t
831findchar(const void *s, int kind,
832 Py_ssize_t size, Py_UCS4 ch,
833 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200835 switch (kind) {
836 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200837 if ((Py_UCS1) ch != ch)
838 return -1;
839 if (direction > 0)
840 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
841 else
842 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200843 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200844 if ((Py_UCS2) ch != ch)
845 return -1;
846 if (direction > 0)
847 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
848 else
849 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200850 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200851 if (direction > 0)
852 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
853 else
854 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200855 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700856 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858}
859
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, with
   0xff bytes; nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value doubles as the size of one character in bytes */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
878
Victor Stinnerfe226c02011-10-03 03:52:20 +0200879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100886 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200887#ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
Victor Stinner79891572012-05-03 13:43:07 +0200891 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200892 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100893 assert(PyUnicode_IS_COMPACT(unicode));
894
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200895 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100896 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901
Victor Stinnerfe226c02011-10-03 03:52:20 +0200902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
Victor Stinner84def372011-12-11 20:04:56 +0100913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100917 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100918 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 PyErr_NoMemory();
920 return NULL;
921 }
Victor Stinner84def372011-12-11 20:04:56 +0100922 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200923 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100924
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200926 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100928 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100936 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200937#ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200942 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 return unicode;
944}
945
Alexander Belopolsky40018472011-02-26 01:02:56 +0000946static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200947resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948{
Victor Stinner95663112011-10-04 01:03:50 +0200949 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200952 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200956 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200958#ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961
962 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200963 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
Victor Stinner7a9105a2011-12-12 00:13:42 +0100973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200986 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200987 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200991 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200996#ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998#endif
Victor Stinner95663112011-10-04 01:03:50 +0200999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001000 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 }
Victor Stinner95663112011-10-04 01:03:50 +02001004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001008 PyErr_NoMemory();
1009 return -1;
1010 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001012 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001021 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 return 0;
1023}
1024
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001031
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001032 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033
1034 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1035 if (copy == NULL)
1036 return NULL;
1037
1038 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001039 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001041 }
1042 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001043 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 if (w == NULL)
1047 return NULL;
1048 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1049 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001050 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001051 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 }
1054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001057 Ux0000 terminated; some code (e.g. new_identifier)
1058 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001061 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062
1063*/
1064
Alexander Belopolsky40018472011-02-26 01:02:56 +00001065static PyUnicodeObject *
1066_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001068 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
Thomas Wouters477c8d52006-05-27 19:21:47 +00001071 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (length == 0 && unicode_empty != NULL) {
1073 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001074 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 }
1076
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001077 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001078 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001079 return (PyUnicodeObject *)PyErr_NoMemory();
1080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 if (length < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to _PyUnicode_New");
1084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 }
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1088 if (unicode == NULL)
1089 return NULL;
1090 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001091
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 _PyUnicode_HASH(unicode) = -1;
1094 _PyUnicode_STATE(unicode).interned = 0;
1095 _PyUnicode_STATE(unicode).kind = 0;
1096 _PyUnicode_STATE(unicode).compact = 0;
1097 _PyUnicode_STATE(unicode).ready = 0;
1098 _PyUnicode_STATE(unicode).ascii = 0;
1099 _PyUnicode_DATA_ANY(unicode) = NULL;
1100 _PyUnicode_LENGTH(unicode) = 0;
1101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1105 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001106 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001108 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110
Jeremy Hyltond8082792003-09-16 19:41:39 +00001111 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001112 * the caller fails before initializing str -- unicode_resize()
1113 * reads str[0], and the Keep-Alive optimization can keep memory
1114 * allocated for str alive across a call to unicode_dealloc(unicode).
1115 * We don't want unicode_resize to read uninitialized memory in
1116 * that case.
1117 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 _PyUnicode_WSTR(unicode)[0] = 0;
1119 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001120
Victor Stinner7931d9a2011-11-04 00:22:48 +01001121 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 return unicode;
1123}
1124
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125static const char*
1126unicode_kind_name(PyObject *unicode)
1127{
Victor Stinner42dfd712011-10-03 14:41:45 +02001128 /* don't check consistency: unicode_kind_name() is called from
1129 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130 if (!PyUnicode_IS_COMPACT(unicode))
1131 {
1132 if (!PyUnicode_IS_READY(unicode))
1133 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001134 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 {
1136 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001137 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001138 return "legacy ascii";
1139 else
1140 return "legacy latin1";
1141 case PyUnicode_2BYTE_KIND:
1142 return "legacy UCS2";
1143 case PyUnicode_4BYTE_KIND:
1144 return "legacy UCS4";
1145 default:
1146 return "<legacy invalid kind>";
1147 }
1148 }
1149 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 return "ascii";
1154 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001155 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 default:
1161 return "<invalid compact kind>";
1162 }
1163}
1164
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return the result of the PyUnicode_UTF8() macro for the given object. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return the result of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses and flags used to locate the character data of the
   given object on stdout, then return PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of a string object on stdout: kind name,
   length, the wstr and utf8 pointers (flagged "shared" when they alias the
   character data) and the data pointer.

   Pointers are cast to void* because that is the type %p requires, and the
   Py_ssize_t lengths are printed with a signed conversion to match their
   type. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* locate the character data without going through the checking macros */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%" PY_FORMAT_SIZE_T "d, ",
           unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* the fields below only exist when the object is not compact ASCII */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%" PY_FORMAT_SIZE_T "d), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "d)",
               (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
1218
1219PyObject *
1220PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1221{
1222 PyObject *obj;
1223 PyCompactUnicodeObject *unicode;
1224 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001225 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001226 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 Py_ssize_t char_size;
1228 Py_ssize_t struct_size;
1229
1230 /* Optimization for empty strings */
1231 if (size == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 }
1235
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 is_ascii = 0;
1237 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 struct_size = sizeof(PyCompactUnicodeObject);
1239 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001240 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 char_size = 1;
1242 is_ascii = 1;
1243 struct_size = sizeof(PyASCIIObject);
1244 }
1245 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 }
1249 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 2;
1252 if (sizeof(wchar_t) == 2)
1253 is_sharing = 1;
1254 }
1255 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001256 if (maxchar > MAX_UNICODE) {
1257 PyErr_SetString(PyExc_SystemError,
1258 "invalid maximum character passed to PyUnicode_New");
1259 return NULL;
1260 }
Victor Stinner8f825062012-04-27 13:55:39 +02001261 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 char_size = 4;
1263 if (sizeof(wchar_t) == 4)
1264 is_sharing = 1;
1265 }
1266
1267 /* Ensure we won't overflow the size. */
1268 if (size < 0) {
1269 PyErr_SetString(PyExc_SystemError,
1270 "Negative size passed to PyUnicode_New");
1271 return NULL;
1272 }
1273 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1274 return PyErr_NoMemory();
1275
1276 /* Duplicated allocation code from _PyObject_New() instead of a call to
1277 * PyObject_New() so we are able to allocate space for the object and
1278 * it's data buffer.
1279 */
1280 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1281 if (obj == NULL)
1282 return PyErr_NoMemory();
1283 obj = PyObject_INIT(obj, &PyUnicode_Type);
1284 if (obj == NULL)
1285 return NULL;
1286
1287 unicode = (PyCompactUnicodeObject *)obj;
1288 if (is_ascii)
1289 data = ((PyASCIIObject*)obj) + 1;
1290 else
1291 data = unicode + 1;
1292 _PyUnicode_LENGTH(unicode) = size;
1293 _PyUnicode_HASH(unicode) = -1;
1294 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001295 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 _PyUnicode_STATE(unicode).compact = 1;
1297 _PyUnicode_STATE(unicode).ready = 1;
1298 _PyUnicode_STATE(unicode).ascii = is_ascii;
1299 if (is_ascii) {
1300 ((char*)data)[size] = 0;
1301 _PyUnicode_WSTR(unicode) = NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 ((char*)data)[size] = 0;
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001308 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 else {
1311 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001312 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001313 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ((Py_UCS4*)data)[size] = 0;
1317 if (is_sharing) {
1318 _PyUnicode_WSTR_LENGTH(unicode) = size;
1319 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1320 }
1321 else {
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323 _PyUnicode_WSTR(unicode) = NULL;
1324 }
1325 }
Victor Stinner8f825062012-04-27 13:55:39 +02001326#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001327 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001328#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001329 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return obj;
1331}
1332
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The wchar_t elements in [begin, end) are written to the 4-byte data of
   `unicode`; a high surrogate directly followed by a low surrogate becomes a
   single code point, every other element is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only examined when it lies inside [begin, end) */
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1372
Victor Stinnercd9950f2011-10-02 00:34:53 +02001373static int
Victor Stinner488fa492011-12-12 00:01:39 +01001374unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001375{
Victor Stinner488fa492011-12-12 00:01:39 +01001376 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001377 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001378 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379 return -1;
1380 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381 return 0;
1382}
1383
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001384static int
1385_copy_characters(PyObject *to, Py_ssize_t to_start,
1386 PyObject *from, Py_ssize_t from_start,
1387 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 unsigned int from_kind, to_kind;
1390 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinneree4544c2012-05-09 22:24:08 +02001392 assert(0 <= how_many);
1393 assert(0 <= from_start);
1394 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001395 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinnerd3f08822012-05-29 12:57:52 +02001399 assert(PyUnicode_Check(to));
1400 assert(PyUnicode_IS_READY(to));
1401 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1402
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001403 if (how_many == 0)
1404 return 0;
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001407 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001409 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerf1852262012-06-16 16:38:26 +02001411#ifdef Py_DEBUG
1412 if (!check_maxchar
1413 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1414 {
1415 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1416 Py_UCS4 ch;
1417 Py_ssize_t i;
1418 for (i=0; i < how_many; i++) {
1419 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1420 assert(ch <= to_maxchar);
1421 }
1422 }
1423#endif
1424
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001425 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001426 if (check_maxchar
1427 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1428 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 /* Writing Latin-1 characters into an ASCII string requires to
1430 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 Py_UCS4 max_char;
1432 max_char = ucs1lib_find_max_char(from_data,
1433 (Py_UCS1*)from_data + how_many);
1434 if (max_char >= 128)
1435 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 }
Christian Heimesf051e432016-09-13 20:22:02 +02001437 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001438 (char*)from_data + from_kind * from_start,
1439 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 else if (from_kind == PyUnicode_1BYTE_KIND
1442 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 {
1444 _PyUnicode_CONVERT_BYTES(
1445 Py_UCS1, Py_UCS2,
1446 PyUnicode_1BYTE_DATA(from) + from_start,
1447 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448 PyUnicode_2BYTE_DATA(to) + to_start
1449 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001452 && to_kind == PyUnicode_4BYTE_KIND)
1453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS4,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_4BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
1461 else if (from_kind == PyUnicode_2BYTE_KIND
1462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS2, Py_UCS4,
1466 PyUnicode_2BYTE_DATA(from) + from_start,
1467 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001471 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001472 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1473
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001474 if (!check_maxchar) {
1475 if (from_kind == PyUnicode_2BYTE_KIND
1476 && to_kind == PyUnicode_1BYTE_KIND)
1477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS2, Py_UCS1,
1480 PyUnicode_2BYTE_DATA(from) + from_start,
1481 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_1BYTE_DATA(to) + to_start
1483 );
1484 }
1485 else if (from_kind == PyUnicode_4BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS4, Py_UCS1,
1490 PyUnicode_4BYTE_DATA(from) + from_start,
1491 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_2BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS2,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_2BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001506 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001507 }
1508 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001509 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001510 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001511 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 Py_ssize_t i;
1513
Victor Stinnera0702ab2011-09-29 14:14:38 +02001514 for (i=0; i < how_many; i++) {
1515 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001516 if (ch > to_maxchar)
1517 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001518 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1519 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 }
1521 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001522 return 0;
1523}
1524
Victor Stinnerd3f08822012-05-29 12:57:52 +02001525void
1526_PyUnicode_FastCopyCharacters(
1527 PyObject *to, Py_ssize_t to_start,
1528 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529{
1530 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1531}
1532
1533Py_ssize_t
1534PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1535 PyObject *from, Py_ssize_t from_start,
1536 Py_ssize_t how_many)
1537{
1538 int err;
1539
1540 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1541 PyErr_BadInternalCall();
1542 return -1;
1543 }
1544
Benjamin Petersonbac79492012-01-14 13:34:47 -05001545 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001547 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 return -1;
1549
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001550 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001551 PyErr_SetString(PyExc_IndexError, "string index out of range");
1552 return -1;
1553 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001554 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001555 PyErr_SetString(PyExc_IndexError, "string index out of range");
1556 return -1;
1557 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001558 if (how_many < 0) {
1559 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1560 return -1;
1561 }
1562 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1564 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001565 "Cannot write %zi characters at %zi "
1566 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001567 how_many, to_start, PyUnicode_GET_LENGTH(to));
1568 return -1;
1569 }
1570
1571 if (how_many == 0)
1572 return 0;
1573
Victor Stinner488fa492011-12-12 00:01:39 +01001574 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001575 return -1;
1576
1577 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1578 if (err) {
1579 PyErr_Format(PyExc_SystemError,
1580 "Cannot copy %s characters "
1581 "into a string of %s characters",
1582 unicode_kind_name(from),
1583 unicode_kind_name(to));
1584 return -1;
1585 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001586 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587}
1588
Victor Stinner17222162011-09-28 22:15:37 +02001589/* Find the maximum code point and count the number of surrogate pairs so a
1590 correct string length can be computed before converting a string to UCS4.
1591 This function counts single surrogates as a character and not as a pair.
1592
1593 Return 0 on success, or -1 on error. */
1594static int
1595find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1596 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597{
1598 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001599 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600
Victor Stinnerc53be962011-10-02 21:33:54 +02001601 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 *num_surrogates = 0;
1603 *maxchar = 0;
1604
1605 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001607 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1608 && (iter+1) < end
1609 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1610 {
1611 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1612 ++(*num_surrogates);
1613 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 }
1615 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001617 {
1618 ch = *iter;
1619 iter++;
1620 }
1621 if (ch > *maxchar) {
1622 *maxchar = ch;
1623 if (*maxchar > MAX_UNICODE) {
1624 PyErr_Format(PyExc_ValueError,
1625 "character U+%x is not in range [U+0000; U+10ffff]",
1626 ch);
1627 return -1;
1628 }
1629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 }
1631 return 0;
1632}
1633
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001634int
1635_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636{
1637 wchar_t *end;
1638 Py_UCS4 maxchar = 0;
1639 Py_ssize_t num_surrogates;
1640#if SIZEOF_WCHAR_T == 2
1641 Py_ssize_t length_wo_surrogates;
1642#endif
1643
Georg Brandl7597add2011-10-05 16:36:47 +02001644 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001645 strings were created using _PyObject_New() and where no canonical
1646 representation (the str field) has been set yet aka strings
1647 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001648 assert(_PyUnicode_CHECK(unicode));
1649 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001651 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001652 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001653 /* Actually, it should neither be interned nor be anything else: */
1654 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001657 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001658 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660
1661 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001662 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1663 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 PyErr_NoMemory();
1665 return -1;
1666 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001667 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 _PyUnicode_WSTR(unicode), end,
1669 PyUnicode_1BYTE_DATA(unicode));
1670 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1671 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1672 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1673 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001674 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001675 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001676 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677 }
1678 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001679 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001680 _PyUnicode_UTF8(unicode) = NULL;
1681 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 }
1683 PyObject_FREE(_PyUnicode_WSTR(unicode));
1684 _PyUnicode_WSTR(unicode) = NULL;
1685 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1686 }
1687 /* In this case we might have to convert down from 4-byte native
1688 wchar_t to 2-byte unicode. */
1689 else if (maxchar < 65536) {
1690 assert(num_surrogates == 0 &&
1691 "FindMaxCharAndNumSurrogatePairs() messed up");
1692
Victor Stinner506f5922011-09-28 22:34:18 +02001693#if SIZEOF_WCHAR_T == 2
1694 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001695 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001696 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1697 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1698 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001699 _PyUnicode_UTF8(unicode) = NULL;
1700 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001701#else
1702 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001703 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001704 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001705 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001706 PyErr_NoMemory();
1707 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708 }
Victor Stinner506f5922011-09-28 22:34:18 +02001709 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1710 _PyUnicode_WSTR(unicode), end,
1711 PyUnicode_2BYTE_DATA(unicode));
1712 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001715 _PyUnicode_UTF8(unicode) = NULL;
1716 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001717 PyObject_FREE(_PyUnicode_WSTR(unicode));
1718 _PyUnicode_WSTR(unicode) = NULL;
1719 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1720#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 }
1722 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1723 else {
1724#if SIZEOF_WCHAR_T == 2
1725 /* in case the native representation is 2-bytes, we need to allocate a
1726 new normalized 4-byte version. */
1727 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001728 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1729 PyErr_NoMemory();
1730 return -1;
1731 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001732 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1733 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 PyErr_NoMemory();
1735 return -1;
1736 }
1737 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1738 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001739 _PyUnicode_UTF8(unicode) = NULL;
1740 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001741 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1742 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001743 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 PyObject_FREE(_PyUnicode_WSTR(unicode));
1745 _PyUnicode_WSTR(unicode) = NULL;
1746 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1747#else
1748 assert(num_surrogates == 0);
1749
Victor Stinnerc3c74152011-10-02 20:39:55 +02001750 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001752 _PyUnicode_UTF8(unicode) = NULL;
1753 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1755#endif
1756 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1757 }
1758 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001759 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return 0;
1761}
1762
Alexander Belopolsky40018472011-02-26 01:02:56 +00001763static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001764unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765{
Walter Dörwald16807132007-05-25 13:52:07 +00001766 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 case SSTATE_NOT_INTERNED:
1768 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001769
Benjamin Peterson29060642009-01-31 22:14:21 +00001770 case SSTATE_INTERNED_MORTAL:
1771 /* revive dead object temporarily for DelItem */
1772 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001773 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001774 Py_FatalError(
1775 "deletion of interned string failed");
1776 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 case SSTATE_INTERNED_IMMORTAL:
1779 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001780 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001781
Benjamin Peterson29060642009-01-31 22:14:21 +00001782 default:
1783 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001784 }
1785
Victor Stinner03490912011-10-03 23:45:12 +02001786 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001788 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001789 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001790 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1791 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001793 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794}
1795
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character string from the
   unicode_latin1 table), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a ready string of length 1 can be a cached character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1812
Alexander Belopolsky40018472011-02-26 01:02:56 +00001813static int
Victor Stinner488fa492011-12-12 00:01:39 +01001814unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001815{
Victor Stinner488fa492011-12-12 00:01:39 +01001816 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 if (Py_REFCNT(unicode) != 1)
1818 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001819 if (_PyUnicode_HASH(unicode) != -1)
1820 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001821 if (PyUnicode_CHECK_INTERNED(unicode))
1822 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001823 if (!PyUnicode_CheckExact(unicode))
1824 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001825#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001826 /* singleton refcount is greater than 1 */
1827 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001828#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001829 return 1;
1830}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831
Victor Stinnerfe226c02011-10-03 03:52:20 +02001832static int
1833unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1834{
1835 PyObject *unicode;
1836 Py_ssize_t old_length;
1837
1838 assert(p_unicode != NULL);
1839 unicode = *p_unicode;
1840
1841 assert(unicode != NULL);
1842 assert(PyUnicode_Check(unicode));
1843 assert(0 <= length);
1844
Victor Stinner910337b2011-10-03 03:20:16 +02001845 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001846 old_length = PyUnicode_WSTR_LENGTH(unicode);
1847 else
1848 old_length = PyUnicode_GET_LENGTH(unicode);
1849 if (old_length == length)
1850 return 0;
1851
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001852 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001853 _Py_INCREF_UNICODE_EMPTY();
1854 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001855 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001856 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001857 return 0;
1858 }
1859
Victor Stinner488fa492011-12-12 00:01:39 +01001860 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001861 PyObject *copy = resize_copy(unicode, length);
1862 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001864 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001866 }
1867
Victor Stinnerfe226c02011-10-03 03:52:20 +02001868 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 PyObject *new_unicode = resize_compact(unicode, length);
1870 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001871 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001872 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001873 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001874 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001875 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001876}
1877
Alexander Belopolsky40018472011-02-26 01:02:56 +00001878int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001879PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001880{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001881 PyObject *unicode;
1882 if (p_unicode == NULL) {
1883 PyErr_BadInternalCall();
1884 return -1;
1885 }
1886 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001887 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 {
1889 PyErr_BadInternalCall();
1890 return -1;
1891 }
1892 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001893}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001895/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001896
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001897 WARNING: The function doesn't copy the terminating null character and
1898 doesn't check the maximum character (may write a latin1 character in an
1899 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001900static void
1901unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1902 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001903{
1904 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1905 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001906 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001907
1908 switch (kind) {
1909 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001910 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001911#ifdef Py_DEBUG
1912 if (PyUnicode_IS_ASCII(unicode)) {
1913 Py_UCS4 maxchar = ucs1lib_find_max_char(
1914 (const Py_UCS1*)str,
1915 (const Py_UCS1*)str + len);
1916 assert(maxchar < 128);
1917 }
1918#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001919 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001920 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001921 }
1922 case PyUnicode_2BYTE_KIND: {
1923 Py_UCS2 *start = (Py_UCS2 *)data + index;
1924 Py_UCS2 *ucs2 = start;
1925 assert(index <= PyUnicode_GET_LENGTH(unicode));
1926
Victor Stinner184252a2012-06-16 02:57:41 +02001927 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 *ucs2 = (Py_UCS2)*str;
1929
1930 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001931 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001932 }
1933 default: {
1934 Py_UCS4 *start = (Py_UCS4 *)data + index;
1935 Py_UCS4 *ucs4 = start;
1936 assert(kind == PyUnicode_4BYTE_KIND);
1937 assert(index <= PyUnicode_GET_LENGTH(unicode));
1938
Victor Stinner184252a2012-06-16 02:57:41 +02001939 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001940 *ucs4 = (Py_UCS4)*str;
1941
1942 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001943 }
1944 }
1945}
1946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947static PyObject*
1948get_latin1_char(unsigned char ch)
1949{
Victor Stinnera464fc12011-10-02 20:39:30 +02001950 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001952 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 if (!unicode)
1954 return NULL;
1955 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001956 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 unicode_latin1[ch] = unicode;
1958 }
1959 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001960 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961}
1962
Victor Stinner985a82a2014-01-03 12:53:47 +01001963static PyObject*
1964unicode_char(Py_UCS4 ch)
1965{
1966 PyObject *unicode;
1967
1968 assert(ch <= MAX_UNICODE);
1969
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001970 if (ch < 256)
1971 return get_latin1_char(ch);
1972
Victor Stinner985a82a2014-01-03 12:53:47 +01001973 unicode = PyUnicode_New(1, ch);
1974 if (unicode == NULL)
1975 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001976
1977 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1978 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001979 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001980 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001981 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1982 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1983 }
1984 assert(_PyUnicode_CheckConsistency(unicode, 1));
1985 return unicode;
1986}
1987
Alexander Belopolsky40018472011-02-26 01:02:56 +00001988PyObject *
1989PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02001991 if (u == NULL)
1992 return (PyObject*)_PyUnicode_New(size);
1993
1994 if (size < 0) {
1995 PyErr_BadInternalCall();
1996 return NULL;
1997 }
1998
1999 return PyUnicode_FromWideChar(u, size);
2000}
2001
2002PyObject *
2003PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2004{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002005 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 Py_UCS4 maxchar = 0;
2007 Py_ssize_t num_surrogates;
2008
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002009 if (u == NULL && size != 0) {
2010 PyErr_BadInternalCall();
2011 return NULL;
2012 }
2013
2014 if (size == -1) {
2015 size = wcslen(u);
2016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002018 /* If the Unicode data is known at construction time, we can apply
2019 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002022 if (size == 0)
2023 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 /* Single character Unicode objects in the Latin-1 range are
2026 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002027 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return get_latin1_char((unsigned char)*u);
2029
2030 /* If not empty and not single character, copy the Unicode data
2031 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002032 if (find_maxchar_surrogates(u, u + size,
2033 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 return NULL;
2035
Victor Stinner8faf8212011-12-08 22:14:11 +01002036 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 if (!unicode)
2038 return NULL;
2039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 switch (PyUnicode_KIND(unicode)) {
2041 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002042 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2044 break;
2045 case PyUnicode_2BYTE_KIND:
2046#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002047 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002049 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2051#endif
2052 break;
2053 case PyUnicode_4BYTE_KIND:
2054#if SIZEOF_WCHAR_T == 2
2055 /* This is the only case which has to process surrogates, thus
2056 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002057 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058#else
2059 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002060 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061#endif
2062 break;
2063 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002064 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002067 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068}
2069
Alexander Belopolsky40018472011-02-26 01:02:56 +00002070PyObject *
2071PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002072{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002073 if (size < 0) {
2074 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002075 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002076 return NULL;
2077 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002078 if (u != NULL)
2079 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2080 else
2081 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002082}
2083
Alexander Belopolsky40018472011-02-26 01:02:56 +00002084PyObject *
2085PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002086{
2087 size_t size = strlen(u);
2088 if (size > PY_SSIZE_T_MAX) {
2089 PyErr_SetString(PyExc_OverflowError, "input too long");
2090 return NULL;
2091 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002092 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002093}
2094
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002095PyObject *
2096_PyUnicode_FromId(_Py_Identifier *id)
2097{
2098 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002099 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2100 strlen(id->string),
2101 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002102 if (!id->object)
2103 return NULL;
2104 PyUnicode_InternInPlace(&id->object);
2105 assert(!id->next);
2106 id->next = static_strings;
2107 static_strings = id;
2108 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002109 return id->object;
2110}
2111
2112void
2113_PyUnicode_ClearStaticStrings()
2114{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002115 _Py_Identifier *tmp, *s = static_strings;
2116 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002117 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002118 tmp = s->next;
2119 s->next = NULL;
2120 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002121 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002122 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002123}
2124
Benjamin Peterson0df54292012-03-26 14:50:32 -04002125/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002126
Victor Stinnerd3f08822012-05-29 12:57:52 +02002127PyObject*
2128_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002129{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002130 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002131 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002132 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002133#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002134 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002135#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002136 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002137 }
Victor Stinner785938e2011-12-11 20:09:03 +01002138 unicode = PyUnicode_New(size, 127);
2139 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002140 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002141 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2142 assert(_PyUnicode_CheckConsistency(unicode, 1));
2143 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002144}
2145
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002146static Py_UCS4
2147kind_maxchar_limit(unsigned int kind)
2148{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002149 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002150 case PyUnicode_1BYTE_KIND:
2151 return 0x80;
2152 case PyUnicode_2BYTE_KIND:
2153 return 0x100;
2154 case PyUnicode_4BYTE_KIND:
2155 return 0x10000;
2156 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002157 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002158 }
2159}
2160
Victor Stinner702c7342011-10-05 13:50:52 +02002161static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002162_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002165 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002166
Serhiy Storchaka678db842013-01-26 12:16:36 +02002167 if (size == 0)
2168 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002169 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002170 if (size == 1)
2171 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002172
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002173 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002174 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175 if (!res)
2176 return NULL;
2177 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002178 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002180}
2181
Victor Stinnere57b1c02011-09-28 22:20:48 +02002182static PyObject*
2183_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184{
2185 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002187
Serhiy Storchaka678db842013-01-26 12:16:36 +02002188 if (size == 0)
2189 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002190 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002191 if (size == 1)
2192 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002193
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002194 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002195 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 if (!res)
2197 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002198 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002200 else {
2201 _PyUnicode_CONVERT_BYTES(
2202 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2203 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002204 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 return res;
2206}
2207
Victor Stinnere57b1c02011-09-28 22:20:48 +02002208static PyObject*
2209_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210{
2211 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002212 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002213
Serhiy Storchaka678db842013-01-26 12:16:36 +02002214 if (size == 0)
2215 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002216 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002217 if (size == 1)
2218 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002219
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002220 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002221 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 if (!res)
2223 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002224 if (max_char < 256)
2225 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2226 PyUnicode_1BYTE_DATA(res));
2227 else if (max_char < 0x10000)
2228 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2229 PyUnicode_2BYTE_DATA(res));
2230 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002232 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 return res;
2234}
2235
2236PyObject*
2237PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2238{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002239 if (size < 0) {
2240 PyErr_SetString(PyExc_ValueError, "size must be positive");
2241 return NULL;
2242 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002243 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002245 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002247 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002249 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002250 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251 PyErr_SetString(PyExc_SystemError, "invalid kind");
2252 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254}
2255
Victor Stinnerece58de2012-04-23 23:36:38 +02002256Py_UCS4
2257_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2258{
2259 enum PyUnicode_Kind kind;
2260 void *startptr, *endptr;
2261
2262 assert(PyUnicode_IS_READY(unicode));
2263 assert(0 <= start);
2264 assert(end <= PyUnicode_GET_LENGTH(unicode));
2265 assert(start <= end);
2266
2267 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2268 return PyUnicode_MAX_CHAR_VALUE(unicode);
2269
2270 if (start == end)
2271 return 127;
2272
Victor Stinner94d558b2012-04-27 22:26:58 +02002273 if (PyUnicode_IS_ASCII(unicode))
2274 return 127;
2275
Victor Stinnerece58de2012-04-23 23:36:38 +02002276 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002277 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002278 endptr = (char *)startptr + end * kind;
2279 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002280 switch(kind) {
2281 case PyUnicode_1BYTE_KIND:
2282 return ucs1lib_find_max_char(startptr, endptr);
2283 case PyUnicode_2BYTE_KIND:
2284 return ucs2lib_find_max_char(startptr, endptr);
2285 case PyUnicode_4BYTE_KIND:
2286 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002287 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002288 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002289 }
2290}
2291
Victor Stinner25a4b292011-10-06 12:31:55 +02002292/* Ensure that a string uses the most efficient storage, if it is not the
2293 case: create a new string with of the right kind. Write NULL into *p_unicode
2294 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002295static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002296unicode_adjust_maxchar(PyObject **p_unicode)
2297{
2298 PyObject *unicode, *copy;
2299 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002300 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002301 unsigned int kind;
2302
2303 assert(p_unicode != NULL);
2304 unicode = *p_unicode;
2305 assert(PyUnicode_IS_READY(unicode));
2306 if (PyUnicode_IS_ASCII(unicode))
2307 return;
2308
2309 len = PyUnicode_GET_LENGTH(unicode);
2310 kind = PyUnicode_KIND(unicode);
2311 if (kind == PyUnicode_1BYTE_KIND) {
2312 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs1lib_find_max_char(u, u + len);
2314 if (max_char >= 128)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
2317 else if (kind == PyUnicode_2BYTE_KIND) {
2318 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 max_char = ucs2lib_find_max_char(u, u + len);
2320 if (max_char >= 256)
2321 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002322 }
2323 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002324 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002325 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002326 max_char = ucs4lib_find_max_char(u, u + len);
2327 if (max_char >= 0x10000)
2328 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002329 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002330 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002331 if (copy != NULL)
2332 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002333 Py_DECREF(unicode);
2334 *p_unicode = copy;
2335}
2336
Victor Stinner034f6cf2011-09-30 02:26:44 +02002337PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002338_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002339{
Victor Stinner87af4f22011-11-21 23:03:47 +01002340 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002341 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002342
Victor Stinner034f6cf2011-09-30 02:26:44 +02002343 if (!PyUnicode_Check(unicode)) {
2344 PyErr_BadInternalCall();
2345 return NULL;
2346 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002347 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002348 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002349
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 length = PyUnicode_GET_LENGTH(unicode);
2351 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002352 if (!copy)
2353 return NULL;
2354 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2355
Christian Heimesf051e432016-09-13 20:22:02 +02002356 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002357 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002358 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002359 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002360}
2361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362
Victor Stinnerbc603d12011-10-02 01:00:40 +02002363/* Widen Unicode objects to larger buffers. Don't write terminating null
2364 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365
2366void*
2367_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2368{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002369 Py_ssize_t len;
2370 void *result;
2371 unsigned int skind;
2372
Benjamin Petersonbac79492012-01-14 13:34:47 -05002373 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002374 return NULL;
2375
2376 len = PyUnicode_GET_LENGTH(s);
2377 skind = PyUnicode_KIND(s);
2378 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002379 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 return NULL;
2381 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002382 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002383 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002384 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002385 if (!result)
2386 return PyErr_NoMemory();
2387 assert(skind == PyUnicode_1BYTE_KIND);
2388 _PyUnicode_CONVERT_BYTES(
2389 Py_UCS1, Py_UCS2,
2390 PyUnicode_1BYTE_DATA(s),
2391 PyUnicode_1BYTE_DATA(s) + len,
2392 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002394 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002395 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002396 if (!result)
2397 return PyErr_NoMemory();
2398 if (skind == PyUnicode_2BYTE_KIND) {
2399 _PyUnicode_CONVERT_BYTES(
2400 Py_UCS2, Py_UCS4,
2401 PyUnicode_2BYTE_DATA(s),
2402 PyUnicode_2BYTE_DATA(s) + len,
2403 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405 else {
2406 assert(skind == PyUnicode_1BYTE_KIND);
2407 _PyUnicode_CONVERT_BYTES(
2408 Py_UCS1, Py_UCS4,
2409 PyUnicode_1BYTE_DATA(s),
2410 PyUnicode_1BYTE_DATA(s) + len,
2411 result);
2412 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 default:
2415 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 }
Victor Stinner01698042011-10-04 00:04:26 +02002417 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 return NULL;
2419}
2420
2421static Py_UCS4*
2422as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2423 int copy_null)
2424{
2425 int kind;
2426 void *data;
2427 Py_ssize_t len, targetlen;
2428 if (PyUnicode_READY(string) == -1)
2429 return NULL;
2430 kind = PyUnicode_KIND(string);
2431 data = PyUnicode_DATA(string);
2432 len = PyUnicode_GET_LENGTH(string);
2433 targetlen = len;
2434 if (copy_null)
2435 targetlen++;
2436 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002437 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 if (!target) {
2439 PyErr_NoMemory();
2440 return NULL;
2441 }
2442 }
2443 else {
2444 if (targetsize < targetlen) {
2445 PyErr_Format(PyExc_SystemError,
2446 "string is longer than the buffer");
2447 if (copy_null && 0 < targetsize)
2448 target[0] = 0;
2449 return NULL;
2450 }
2451 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002452 if (kind == PyUnicode_1BYTE_KIND) {
2453 Py_UCS1 *start = (Py_UCS1 *) data;
2454 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002456 else if (kind == PyUnicode_2BYTE_KIND) {
2457 Py_UCS2 *start = (Py_UCS2 *) data;
2458 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2459 }
2460 else {
2461 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002462 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (copy_null)
2465 target[len] = 0;
2466 return target;
2467}
2468
2469Py_UCS4*
2470PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2471 int copy_null)
2472{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002473 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 PyErr_BadInternalCall();
2475 return NULL;
2476 }
2477 return as_ucs4(string, target, targetsize, copy_null);
2478}
2479
2480Py_UCS4*
2481PyUnicode_AsUCS4Copy(PyObject *string)
2482{
2483 return as_ucs4(string, NULL, 0, 1);
2484}
2485
/* Size of the scratch buffers used to format %lld or %p output.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 character for the sign; 53/22 is used as an upper bound
   for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002490
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002491static int
2492unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2493 Py_ssize_t width, Py_ssize_t precision)
2494{
2495 Py_ssize_t length, fill, arglen;
2496 Py_UCS4 maxchar;
2497
2498 if (PyUnicode_READY(str) == -1)
2499 return -1;
2500
2501 length = PyUnicode_GET_LENGTH(str);
2502 if ((precision == -1 || precision >= length)
2503 && width <= length)
2504 return _PyUnicodeWriter_WriteStr(writer, str);
2505
2506 if (precision != -1)
2507 length = Py_MIN(precision, length);
2508
2509 arglen = Py_MAX(length, width);
2510 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2511 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2512 else
2513 maxchar = writer->maxchar;
2514
2515 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2516 return -1;
2517
2518 if (width > length) {
2519 fill = width - length;
2520 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2521 return -1;
2522 writer->pos += fill;
2523 }
2524
2525 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2526 str, 0, length);
2527 writer->pos += length;
2528 return 0;
2529}
2530
2531static int
Victor Stinner886483e2018-09-07 18:00:58 +02002532unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533 Py_ssize_t width, Py_ssize_t precision)
2534{
2535 /* UTF-8 */
2536 Py_ssize_t length;
2537 PyObject *unicode;
2538 int res;
2539
2540 length = strlen(str);
2541 if (precision != -1)
2542 length = Py_MIN(length, precision);
2543 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2544 if (unicode == NULL)
2545 return -1;
2546
2547 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2548 Py_DECREF(unicode);
2549 return res;
2550}
2551
Victor Stinner96865452011-03-01 23:44:09 +00002552static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002553unicode_fromformat_arg(_PyUnicodeWriter *writer,
2554 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002555{
Victor Stinnere215d962012-10-06 23:03:36 +02002556 const char *p;
2557 Py_ssize_t len;
2558 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002559 Py_ssize_t width;
2560 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002561 int longflag;
2562 int longlongflag;
2563 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002564 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002565
2566 p = f;
2567 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002568 zeropad = 0;
2569 if (*f == '0') {
2570 zeropad = 1;
2571 f++;
2572 }
Victor Stinner96865452011-03-01 23:44:09 +00002573
2574 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002575 width = -1;
2576 if (Py_ISDIGIT((unsigned)*f)) {
2577 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002578 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002579 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002581 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002582 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002583 return NULL;
2584 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002586 f++;
2587 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 }
2589 precision = -1;
2590 if (*f == '.') {
2591 f++;
2592 if (Py_ISDIGIT((unsigned)*f)) {
2593 precision = (*f - '0');
2594 f++;
2595 while (Py_ISDIGIT((unsigned)*f)) {
2596 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2597 PyErr_SetString(PyExc_ValueError,
2598 "precision too big");
2599 return NULL;
2600 }
2601 precision = (precision * 10) + (*f - '0');
2602 f++;
2603 }
2604 }
Victor Stinner96865452011-03-01 23:44:09 +00002605 if (*f == '%') {
2606 /* "%.3%s" => f points to "3" */
2607 f--;
2608 }
2609 }
2610 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002611 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002612 f--;
2613 }
Victor Stinner96865452011-03-01 23:44:09 +00002614
2615 /* Handle %ld, %lu, %lld and %llu. */
2616 longflag = 0;
2617 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002618 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002619 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002620 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002621 longflag = 1;
2622 ++f;
2623 }
Victor Stinner96865452011-03-01 23:44:09 +00002624 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002625 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002626 longlongflag = 1;
2627 f += 2;
2628 }
Victor Stinner96865452011-03-01 23:44:09 +00002629 }
2630 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002631 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002632 size_tflag = 1;
2633 ++f;
2634 }
Victor Stinnere215d962012-10-06 23:03:36 +02002635
2636 if (f[1] == '\0')
2637 writer->overallocate = 0;
2638
2639 switch (*f) {
2640 case 'c':
2641 {
2642 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002643 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002644 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002645 "character argument not in range(0x110000)");
2646 return NULL;
2647 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002648 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002649 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002650 break;
2651 }
2652
2653 case 'i':
2654 case 'd':
2655 case 'u':
2656 case 'x':
2657 {
2658 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002659 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002661
2662 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002663 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002664 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002665 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002666 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002667 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002668 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002669 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002670 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002671 va_arg(*vargs, size_t));
2672 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002673 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002674 va_arg(*vargs, unsigned int));
2675 }
2676 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002677 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002678 }
2679 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002680 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002681 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002682 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002683 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002684 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002685 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002686 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002687 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002688 va_arg(*vargs, Py_ssize_t));
2689 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002690 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002691 va_arg(*vargs, int));
2692 }
2693 assert(len >= 0);
2694
Victor Stinnere215d962012-10-06 23:03:36 +02002695 if (precision < len)
2696 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002697
2698 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002699 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2700 return NULL;
2701
Victor Stinnere215d962012-10-06 23:03:36 +02002702 if (width > precision) {
2703 Py_UCS4 fillchar;
2704 fill = width - precision;
2705 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002706 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2707 return NULL;
2708 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002709 }
Victor Stinner15a11362012-10-06 23:48:20 +02002710 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002712 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2713 return NULL;
2714 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002715 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002716
Victor Stinner4a587072013-11-19 12:54:53 +01002717 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2718 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002719 break;
2720 }
2721
2722 case 'p':
2723 {
2724 char number[MAX_LONG_LONG_CHARS];
2725
2726 len = sprintf(number, "%p", va_arg(*vargs, void*));
2727 assert(len >= 0);
2728
2729 /* %p is ill-defined: ensure leading 0x. */
2730 if (number[1] == 'X')
2731 number[1] = 'x';
2732 else if (number[1] != 'x') {
2733 memmove(number + 2, number,
2734 strlen(number) + 1);
2735 number[0] = '0';
2736 number[1] = 'x';
2737 len += 2;
2738 }
2739
Victor Stinner4a587072013-11-19 12:54:53 +01002740 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002741 return NULL;
2742 break;
2743 }
2744
2745 case 's':
2746 {
2747 /* UTF-8 */
2748 const char *s = va_arg(*vargs, const char*);
Victor Stinner886483e2018-09-07 18:00:58 +02002749 if (unicode_fromformat_write_utf8(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002750 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002751 break;
2752 }
2753
2754 case 'U':
2755 {
2756 PyObject *obj = va_arg(*vargs, PyObject *);
2757 assert(obj && _PyUnicode_CHECK(obj));
2758
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002759 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002760 return NULL;
2761 break;
2762 }
2763
2764 case 'V':
2765 {
2766 PyObject *obj = va_arg(*vargs, PyObject *);
2767 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002768 if (obj) {
2769 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002770 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002771 return NULL;
2772 }
2773 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002774 assert(str != NULL);
Victor Stinner886483e2018-09-07 18:00:58 +02002775 if (unicode_fromformat_write_utf8(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002776 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002777 }
2778 break;
2779 }
2780
2781 case 'S':
2782 {
2783 PyObject *obj = va_arg(*vargs, PyObject *);
2784 PyObject *str;
2785 assert(obj);
2786 str = PyObject_Str(obj);
2787 if (!str)
2788 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002789 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002790 Py_DECREF(str);
2791 return NULL;
2792 }
2793 Py_DECREF(str);
2794 break;
2795 }
2796
2797 case 'R':
2798 {
2799 PyObject *obj = va_arg(*vargs, PyObject *);
2800 PyObject *repr;
2801 assert(obj);
2802 repr = PyObject_Repr(obj);
2803 if (!repr)
2804 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002806 Py_DECREF(repr);
2807 return NULL;
2808 }
2809 Py_DECREF(repr);
2810 break;
2811 }
2812
2813 case 'A':
2814 {
2815 PyObject *obj = va_arg(*vargs, PyObject *);
2816 PyObject *ascii;
2817 assert(obj);
2818 ascii = PyObject_ASCII(obj);
2819 if (!ascii)
2820 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002821 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002822 Py_DECREF(ascii);
2823 return NULL;
2824 }
2825 Py_DECREF(ascii);
2826 break;
2827 }
2828
Victor Stinner886483e2018-09-07 18:00:58 +02002829 case 'T':
2830 {
2831 /* Object type name (tp_name) */
2832 PyObject *obj = va_arg(*vargs, PyObject *);
2833 PyTypeObject *type = Py_TYPE(obj);
2834 const char *type_name = type->tp_name;
2835 if (unicode_fromformat_write_utf8(writer, type_name, -1, -1) < 0) {
2836 return NULL;
2837 }
2838 break;
2839 }
Victor Stinnere215d962012-10-06 23:03:36 +02002840 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002841 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002842 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002843 break;
2844
2845 default:
2846 /* if we stumble upon an unknown formatting code, copy the rest
2847 of the format string to the output string. (we cannot just
2848 skip the code, since there's no way to know what's in the
2849 argument list) */
2850 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002851 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002852 return NULL;
2853 f = p+len;
2854 return f;
2855 }
2856
2857 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002858 return f;
2859}
2860
Walter Dörwaldd2034312007-05-18 16:29:38 +00002861PyObject *
2862PyUnicode_FromFormatV(const char *format, va_list vargs)
2863{
Victor Stinnere215d962012-10-06 23:03:36 +02002864 va_list vargs2;
2865 const char *f;
2866 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002867
Victor Stinner8f674cc2013-04-17 23:02:17 +02002868 _PyUnicodeWriter_Init(&writer);
2869 writer.min_length = strlen(format) + 100;
2870 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002871
Benjamin Peterson0c212142016-09-20 20:39:33 -07002872 // Copy varags to be able to pass a reference to a subfunction.
2873 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002874
2875 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002876 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002877 f = unicode_fromformat_arg(&writer, f, &vargs2);
2878 if (f == NULL)
2879 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002881 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 const char *p;
2883 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002884
Victor Stinnere215d962012-10-06 23:03:36 +02002885 p = f;
2886 do
2887 {
2888 if ((unsigned char)*p > 127) {
2889 PyErr_Format(PyExc_ValueError,
2890 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2891 "string, got a non-ASCII byte: 0x%02x",
2892 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002893 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002894 }
2895 p++;
2896 }
2897 while (*p != '\0' && *p != '%');
2898 len = p - f;
2899
2900 if (*p == '\0')
2901 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002902
2903 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002904 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002905
2906 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002908 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002909 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002910 return _PyUnicodeWriter_Finish(&writer);
2911
2912 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002913 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002914 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002915 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002916}
2917
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918PyObject *
2919PyUnicode_FromFormat(const char *format, ...)
2920{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002921 PyObject* ret;
2922 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002923
2924#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002926#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002927 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002928#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 ret = PyUnicode_FromFormatV(format, vargs);
2930 va_end(vargs);
2931 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932}
2933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002934#ifdef HAVE_WCHAR_H
2935
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002936/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002937
Victor Stinnerd88d9832011-09-06 02:00:05 +02002938 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002939 character) required to convert the unicode object. Ignore size argument.
2940
Victor Stinnerd88d9832011-09-06 02:00:05 +02002941 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002942 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002943 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002944Py_ssize_t
2945PyUnicode_AsWideChar(PyObject *unicode,
2946 wchar_t *w,
2947 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002948{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002950 const wchar_t *wstr;
2951
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002952 if (unicode == NULL) {
2953 PyErr_BadInternalCall();
2954 return -1;
2955 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002956 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 if (wstr == NULL)
2958 return -1;
2959
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 if (size > res)
2962 size = res + 1;
2963 else
2964 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002965 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 return res;
2967 }
2968 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002970}
2971
Victor Stinner137c34c2010-09-29 10:25:54 +00002972wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002973PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002974 Py_ssize_t *size)
2975{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002976 const wchar_t *wstr;
2977 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002978 Py_ssize_t buflen;
2979
2980 if (unicode == NULL) {
2981 PyErr_BadInternalCall();
2982 return NULL;
2983 }
2984
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002985 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
2986 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002987 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002988 }
2989 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
2990 PyErr_SetString(PyExc_ValueError,
2991 "embedded null character");
2992 return NULL;
2993 }
2994
2995 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00002996 if (buffer == NULL) {
2997 PyErr_NoMemory();
2998 return NULL;
2999 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003000 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003001 if (size != NULL)
3002 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003003 return buffer;
3004}
3005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003006#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007
Alexander Belopolsky40018472011-02-26 01:02:56 +00003008PyObject *
3009PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003010{
Victor Stinner8faf8212011-12-08 22:14:11 +01003011 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003012 PyErr_SetString(PyExc_ValueError,
3013 "chr() arg not in range(0x110000)");
3014 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003015 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003016
Victor Stinner985a82a2014-01-03 12:53:47 +01003017 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003018}
3019
Alexander Belopolsky40018472011-02-26 01:02:56 +00003020PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003021PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003023 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003024 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003025 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003026 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003027 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 Py_INCREF(obj);
3029 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003030 }
3031 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 /* For a Unicode subtype that's not a Unicode object,
3033 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003034 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003035 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003036 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02003037 "Can't convert '%T' object to str implicitly", obj);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003038 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003039}
3040
Alexander Belopolsky40018472011-02-26 01:02:56 +00003041PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003042PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003043 const char *encoding,
3044 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003045{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003046 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003047 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003048
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 PyErr_BadInternalCall();
3051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003053
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003054 /* Decoding bytes objects is the most common case and should be fast */
3055 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003056 if (PyBytes_GET_SIZE(obj) == 0)
3057 _Py_RETURN_UNICODE_EMPTY();
3058 v = PyUnicode_Decode(
3059 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3060 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 return v;
3062 }
3063
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003064 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 PyErr_SetString(PyExc_TypeError,
3066 "decoding str is not supported");
3067 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003068 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3071 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3072 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02003073 "decoding to str: need a bytes-like object, %T found",
3074 obj);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003075 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003076 }
Tim Petersced69f82003-09-16 20:30:58 +00003077
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003078 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003079 PyBuffer_Release(&buffer);
3080 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003082
Serhiy Storchaka05997252013-01-26 12:14:02 +02003083 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003084 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003085 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086}
3087
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters between two copied characters becomes a single '_';
   such runs at the start or end of the name are dropped.  The result is
   written to `lower` (capacity `lower_len`, terminator included).

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_last;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    dst_last = &lower[lower_len - 1];   /* slot reserved for the '\0' */
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char ch = *src;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        /* Emit the deferred separator, but never as the first character. */
        if (pending_sep && dst != lower) {
            if (dst == dst_last) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_last) {
            return 0;
        }
        *dst++ = Py_TOLOWER(ch);
    }

    *dst = '\0';
    return 1;
}
3136
Alexander Belopolsky40018472011-02-26 01:02:56 +00003137PyObject *
3138PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003139 Py_ssize_t size,
3140 const char *encoding,
3141 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003142{
3143 PyObject *buffer = NULL, *unicode;
3144 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003145 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3146
3147 if (encoding == NULL) {
3148 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3149 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003150
Fred Drakee4315f52000-05-09 19:53:39 +00003151 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003152 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3153 char *lower = buflower;
3154
3155 /* Fast paths */
3156 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3157 lower += 3;
3158 if (*lower == '_') {
3159 /* Match "utf8" and "utf_8" */
3160 lower++;
3161 }
3162
3163 if (lower[0] == '8' && lower[1] == 0) {
3164 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3165 }
3166 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3167 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3168 }
3169 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3170 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3171 }
3172 }
3173 else {
3174 if (strcmp(lower, "ascii") == 0
3175 || strcmp(lower, "us_ascii") == 0) {
3176 return PyUnicode_DecodeASCII(s, size, errors);
3177 }
Steve Dowercc16be82016-09-08 10:35:16 -07003178 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003179 else if (strcmp(lower, "mbcs") == 0) {
3180 return PyUnicode_DecodeMBCS(s, size, errors);
3181 }
3182 #endif
3183 else if (strcmp(lower, "latin1") == 0
3184 || strcmp(lower, "latin_1") == 0
3185 || strcmp(lower, "iso_8859_1") == 0
3186 || strcmp(lower, "iso8859_1") == 0) {
3187 return PyUnicode_DecodeLatin1(s, size, errors);
3188 }
3189 }
Victor Stinner37296e82010-06-10 13:36:23 +00003190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191
3192 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003193 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003194 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003195 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003196 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 if (buffer == NULL)
3198 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003199 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 if (unicode == NULL)
3201 goto onError;
3202 if (!PyUnicode_Check(unicode)) {
3203 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02003204 "'%.400s' decoder returned '%T' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003205 "use codecs.decode() to decode to arbitrary types",
Victor Stinner886483e2018-09-07 18:00:58 +02003206 encoding, unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 Py_DECREF(unicode);
3208 goto onError;
3209 }
3210 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003211 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003212
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 Py_XDECREF(buffer);
3215 return NULL;
3216}
3217
Alexander Belopolsky40018472011-02-26 01:02:56 +00003218PyObject *
3219PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003220 const char *encoding,
3221 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003222{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003223 if (!PyUnicode_Check(unicode)) {
3224 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003225 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003226 }
3227
Serhiy Storchaka00939072016-10-27 21:05:49 +03003228 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3229 "PyUnicode_AsDecodedObject() is deprecated; "
3230 "use PyCodec_Decode() to decode from str", 1) < 0)
3231 return NULL;
3232
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003233 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003235
3236 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003237 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238}
3239
Alexander Belopolsky40018472011-02-26 01:02:56 +00003240PyObject *
3241PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003242 const char *encoding,
3243 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003244{
3245 PyObject *v;
3246
3247 if (!PyUnicode_Check(unicode)) {
3248 PyErr_BadArgument();
3249 goto onError;
3250 }
3251
Serhiy Storchaka00939072016-10-27 21:05:49 +03003252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3253 "PyUnicode_AsDecodedUnicode() is deprecated; "
3254 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3255 return NULL;
3256
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003257 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003259
3260 /* Decode via the codec registry */
3261 v = PyCodec_Decode(unicode, encoding, errors);
3262 if (v == NULL)
3263 goto onError;
3264 if (!PyUnicode_Check(v)) {
3265 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02003266 "'%.400s' decoder returned '%T' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003267 "use codecs.decode() to decode to arbitrary types",
Victor Stinner886483e2018-09-07 18:00:58 +02003268 encoding, unicode);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003269 Py_DECREF(v);
3270 goto onError;
3271 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003272 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003273
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003275 return NULL;
3276}
3277
Alexander Belopolsky40018472011-02-26 01:02:56 +00003278PyObject *
3279PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003280 Py_ssize_t size,
3281 const char *encoding,
3282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283{
3284 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003285
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003286 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3290 Py_DECREF(unicode);
3291 return v;
3292}
3293
Alexander Belopolsky40018472011-02-26 01:02:56 +00003294PyObject *
3295PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003296 const char *encoding,
3297 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003298{
3299 PyObject *v;
3300
3301 if (!PyUnicode_Check(unicode)) {
3302 PyErr_BadArgument();
3303 goto onError;
3304 }
3305
Serhiy Storchaka00939072016-10-27 21:05:49 +03003306 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3307 "PyUnicode_AsEncodedObject() is deprecated; "
3308 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3309 "or PyCodec_Encode() for generic encoding", 1) < 0)
3310 return NULL;
3311
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003312 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003314
3315 /* Encode via the codec registry */
3316 v = PyCodec_Encode(unicode, encoding, errors);
3317 if (v == NULL)
3318 goto onError;
3319 return v;
3320
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003322 return NULL;
3323}
3324
Victor Stinner1b579672011-12-17 05:47:23 +01003325
Victor Stinner2cba6b82018-01-10 22:46:15 +01003326static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003327unicode_encode_locale(PyObject *unicode, const char *errors,
3328 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003329{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003330 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003331
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003332 Py_ssize_t wlen;
3333 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3334 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003335 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003336 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003337
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003338 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003339 if (wlen2 != wlen) {
3340 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003341 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003342 return NULL;
3343 }
3344
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003345 char *str;
3346 size_t error_pos;
3347 const char *reason;
3348 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003349 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003350 if (res != 0) {
3351 if (res == -2) {
3352 PyObject *exc;
3353 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3354 "locale", unicode,
3355 (Py_ssize_t)error_pos,
3356 (Py_ssize_t)(error_pos+1),
3357 reason);
3358 if (exc != NULL) {
3359 PyCodec_StrictErrors(exc);
3360 Py_DECREF(exc);
3361 }
3362 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003363 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003364 else if (res == -3) {
3365 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3366 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003367 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003368 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003369 PyMem_Free(wstr);
3370 return NULL;
3371 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003372 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003373 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003374
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003375 PyObject *bytes = PyBytes_FromString(str);
3376 PyMem_RawFree(str);
3377 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003378}
3379
Victor Stinnerad158722010-10-27 00:25:46 +00003380PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003381PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3382{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003383 return unicode_encode_locale(unicode, errors, 1);
3384}
3385
3386PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003387PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003388{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003389 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003390 const _PyCoreConfig *config = &interp->core_config;
3391#if defined(__APPLE__)
3392 return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
3393#else
Victor Stinner793b5312011-04-27 00:24:21 +02003394 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3395 cannot use it to encode and decode filenames before it is loaded. Load
3396 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003397 implementation of the locale codec until the codec registry is
3398 initialized and the Python codec is loaded. See initfsencoding(). */
3399 if (interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003400 return PyUnicode_AsEncodedString(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003401 config->filesystem_encoding,
3402 config->filesystem_errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003403 }
3404 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003405 return unicode_encode_locale(unicode,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003406 config->filesystem_errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003407 }
Victor Stinnerad158722010-10-27 00:25:46 +00003408#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003409}
3410
Alexander Belopolsky40018472011-02-26 01:02:56 +00003411PyObject *
3412PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003413 const char *encoding,
3414 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415{
3416 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003417 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003418
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 if (!PyUnicode_Check(unicode)) {
3420 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 }
Fred Drakee4315f52000-05-09 19:53:39 +00003423
Victor Stinner942889a2016-09-05 15:40:10 -07003424 if (encoding == NULL) {
3425 return _PyUnicode_AsUTF8String(unicode, errors);
3426 }
3427
Fred Drakee4315f52000-05-09 19:53:39 +00003428 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003429 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3430 char *lower = buflower;
3431
3432 /* Fast paths */
3433 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3434 lower += 3;
3435 if (*lower == '_') {
3436 /* Match "utf8" and "utf_8" */
3437 lower++;
3438 }
3439
3440 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003441 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003442 }
3443 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3444 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3445 }
3446 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3447 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3448 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003449 }
Victor Stinner942889a2016-09-05 15:40:10 -07003450 else {
3451 if (strcmp(lower, "ascii") == 0
3452 || strcmp(lower, "us_ascii") == 0) {
3453 return _PyUnicode_AsASCIIString(unicode, errors);
3454 }
Steve Dowercc16be82016-09-08 10:35:16 -07003455#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003456 else if (strcmp(lower, "mbcs") == 0) {
3457 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3458 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003459#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003460 else if (strcmp(lower, "latin1") == 0 ||
3461 strcmp(lower, "latin_1") == 0 ||
3462 strcmp(lower, "iso_8859_1") == 0 ||
3463 strcmp(lower, "iso8859_1") == 0) {
3464 return _PyUnicode_AsLatin1String(unicode, errors);
3465 }
3466 }
Victor Stinner37296e82010-06-10 13:36:23 +00003467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468
3469 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003470 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003472 return NULL;
3473
3474 /* The normal path */
3475 if (PyBytes_Check(v))
3476 return v;
3477
3478 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003479 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003480 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003481 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003482
3483 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003484 "encoder %s returned bytearray instead of bytes; "
3485 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003486 encoding);
3487 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003488 Py_DECREF(v);
3489 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003490 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003491
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003492 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3493 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003494 Py_DECREF(v);
3495 return b;
3496 }
3497
3498 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02003499 "'%.400s' encoder returned '%T' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003500 "use codecs.encode() to encode to arbitrary types",
Victor Stinner886483e2018-09-07 18:00:58 +02003501 encoding, v);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003502 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003503 return NULL;
3504}
3505
Alexander Belopolsky40018472011-02-26 01:02:56 +00003506PyObject *
3507PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003508 const char *encoding,
3509 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003510{
3511 PyObject *v;
3512
3513 if (!PyUnicode_Check(unicode)) {
3514 PyErr_BadArgument();
3515 goto onError;
3516 }
3517
Serhiy Storchaka00939072016-10-27 21:05:49 +03003518 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3519 "PyUnicode_AsEncodedUnicode() is deprecated; "
3520 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3521 return NULL;
3522
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003523 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003524 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003525
3526 /* Encode via the codec registry */
3527 v = PyCodec_Encode(unicode, encoding, errors);
3528 if (v == NULL)
3529 goto onError;
3530 if (!PyUnicode_Check(v)) {
3531 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02003532 "'%.400s' encoder returned '%T' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003533 "use codecs.encode() to encode to arbitrary types",
Victor Stinner886483e2018-09-07 18:00:58 +02003534 encoding, v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003535 Py_DECREF(v);
3536 goto onError;
3537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003539
Benjamin Peterson29060642009-01-31 22:14:21 +00003540 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541 return NULL;
3542}
3543
Victor Stinner2cba6b82018-01-10 22:46:15 +01003544static PyObject*
3545unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3546 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003547{
Victor Stinner3d4226a2018-08-29 22:21:32 +02003548 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003549
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003550 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3551 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003552 return NULL;
3553 }
3554
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003555 wchar_t *wstr;
3556 size_t wlen;
3557 const char *reason;
3558 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003559 current_locale, error_handler);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003560 if (res != 0) {
3561 if (res == -2) {
3562 PyObject *exc;
3563 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3564 "locale", str, len,
3565 (Py_ssize_t)wlen,
3566 (Py_ssize_t)(wlen + 1),
3567 reason);
3568 if (exc != NULL) {
3569 PyCodec_StrictErrors(exc);
3570 Py_DECREF(exc);
3571 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003572 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003573 else if (res == -3) {
3574 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3575 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003576 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003577 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003578 }
Victor Stinner2f197072011-12-17 07:08:30 +01003579 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003580 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003581
3582 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3583 PyMem_RawFree(wstr);
3584 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003585}
3586
3587PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003588PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3589 const char *errors)
3590{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003591 return unicode_decode_locale(str, len, errors, 1);
3592}
3593
3594PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003595PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003596{
3597 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003598 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003599}
3600
3601
3602PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003603PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003604 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003605 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3606}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003607
Christian Heimes5894ba72007-11-04 11:43:14 +00003608PyObject*
3609PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3610{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003611 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003612 const _PyCoreConfig *config = &interp->core_config;
3613#if defined(__APPLE__)
3614 return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
3615#else
Victor Stinner793b5312011-04-27 00:24:21 +02003616 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3617 cannot use it to encode and decode filenames before it is loaded. Load
3618 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003619 implementation of the locale codec until the codec registry is
3620 initialized and the Python codec is loaded. See initfsencoding(). */
3621 if (interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003622 return PyUnicode_Decode(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003623 config->filesystem_encoding,
3624 config->filesystem_errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003625 }
3626 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003627 return unicode_decode_locale(s, size,
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003628 config->filesystem_errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003629 }
Victor Stinnerad158722010-10-27 00:25:46 +00003630#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003631}
3632
Martin v. Löwis011e8422009-05-05 04:43:17 +00003633
3634int
3635PyUnicode_FSConverter(PyObject* arg, void* addr)
3636{
Brett Cannonec6ce872016-09-06 15:50:29 -07003637 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003638 PyObject *output = NULL;
3639 Py_ssize_t size;
3640 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003641 if (arg == NULL) {
3642 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003643 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003644 return 1;
3645 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003646 path = PyOS_FSPath(arg);
3647 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003648 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003649 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003650 if (PyBytes_Check(path)) {
3651 output = path;
3652 }
3653 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3654 output = PyUnicode_EncodeFSDefault(path);
3655 Py_DECREF(path);
3656 if (!output) {
3657 return 0;
3658 }
3659 assert(PyBytes_Check(output));
3660 }
3661
Victor Stinner0ea2a462010-04-30 00:22:08 +00003662 size = PyBytes_GET_SIZE(output);
3663 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003664 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003665 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003666 Py_DECREF(output);
3667 return 0;
3668 }
3669 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003670 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003671}
3672
3673
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003674int
3675PyUnicode_FSDecoder(PyObject* arg, void* addr)
3676{
Brett Cannona5711202016-09-06 19:36:01 -07003677 int is_buffer = 0;
3678 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003679 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003680 if (arg == NULL) {
3681 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003682 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003683 return 1;
3684 }
Brett Cannona5711202016-09-06 19:36:01 -07003685
3686 is_buffer = PyObject_CheckBuffer(arg);
3687 if (!is_buffer) {
3688 path = PyOS_FSPath(arg);
3689 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003690 return 0;
3691 }
Brett Cannona5711202016-09-06 19:36:01 -07003692 }
3693 else {
3694 path = arg;
3695 Py_INCREF(arg);
3696 }
3697
3698 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003699 output = path;
3700 }
3701 else if (PyBytes_Check(path) || is_buffer) {
3702 PyObject *path_bytes = NULL;
3703
3704 if (!PyBytes_Check(path) &&
3705 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner886483e2018-09-07 18:00:58 +02003706 "path should be string, bytes, "
3707 "or os.PathLike, not %T",
3708 arg))
3709 {
3710 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003712 }
3713 path_bytes = PyBytes_FromObject(path);
3714 Py_DECREF(path);
3715 if (!path_bytes) {
3716 return 0;
3717 }
3718 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3719 PyBytes_GET_SIZE(path_bytes));
3720 Py_DECREF(path_bytes);
3721 if (!output) {
3722 return 0;
3723 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003724 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003725 else {
3726 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02003727 "path should be string, bytes, or os.PathLike, not %T",
3728 arg);
Brett Cannona5711202016-09-06 19:36:01 -07003729 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003730 return 0;
3731 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003732 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003733 Py_DECREF(output);
3734 return 0;
3735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003737 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003738 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003739 Py_DECREF(output);
3740 return 0;
3741 }
3742 *(PyObject**)addr = output;
3743 return Py_CLEANUP_SUPPORTED;
3744}
3745
3746
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003747const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003748PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003749{
Christian Heimesf3863112007-11-22 07:46:41 +00003750 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003752 if (!PyUnicode_Check(unicode)) {
3753 PyErr_BadArgument();
3754 return NULL;
3755 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003756 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003757 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003759 if (PyUnicode_UTF8(unicode) == NULL) {
3760 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003761 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 if (bytes == NULL)
3763 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003764 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3765 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003766 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 Py_DECREF(bytes);
3768 return NULL;
3769 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003770 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003771 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 PyBytes_AS_STRING(bytes),
3773 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774 Py_DECREF(bytes);
3775 }
3776
3777 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003778 *psize = PyUnicode_UTF8_LENGTH(unicode);
3779 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003780}
3781
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003782const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3786}
3787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788Py_UNICODE *
3789PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 const unsigned char *one_byte;
3792#if SIZEOF_WCHAR_T == 4
3793 const Py_UCS2 *two_bytes;
3794#else
3795 const Py_UCS4 *four_bytes;
3796 const Py_UCS4 *ucs4_end;
3797 Py_ssize_t num_surrogates;
3798#endif
3799 wchar_t *w;
3800 wchar_t *wchar_end;
3801
3802 if (!PyUnicode_Check(unicode)) {
3803 PyErr_BadArgument();
3804 return NULL;
3805 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003806 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003808 assert(_PyUnicode_KIND(unicode) != 0);
3809 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3814 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 num_surrogates = 0;
3816
3817 for (; four_bytes < ucs4_end; ++four_bytes) {
3818 if (*four_bytes > 0xFFFF)
3819 ++num_surrogates;
3820 }
3821
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003822 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3823 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3824 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 PyErr_NoMemory();
3826 return NULL;
3827 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003828 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 w = _PyUnicode_WSTR(unicode);
3831 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3832 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3834 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003835 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003837 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3838 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 }
3840 else
3841 *w = *four_bytes;
3842
3843 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07003844 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 }
3846 }
3847 *w = 0;
3848#else
3849 /* sizeof(wchar_t) == 4 */
3850 Py_FatalError("Impossible unicode object state, wstr and str "
3851 "should share memory already.");
3852 return NULL;
3853#endif
3854 }
3855 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003856 if ((size_t)_PyUnicode_LENGTH(unicode) >
3857 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3858 PyErr_NoMemory();
3859 return NULL;
3860 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003861 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3862 (_PyUnicode_LENGTH(unicode) + 1));
3863 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864 PyErr_NoMemory();
3865 return NULL;
3866 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003867 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3868 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3869 w = _PyUnicode_WSTR(unicode);
3870 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3873 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 for (; w < wchar_end; ++one_byte, ++w)
3875 *w = *one_byte;
3876 /* null-terminate the wstr */
3877 *w = 0;
3878 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003879 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 for (; w < wchar_end; ++two_bytes, ++w)
3883 *w = *two_bytes;
3884 /* null-terminate the wstr */
3885 *w = 0;
3886#else
3887 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 PyObject_FREE(_PyUnicode_WSTR(unicode));
3889 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 Py_FatalError("Impossible unicode object state, wstr "
3891 "and str should share memory already.");
3892 return NULL;
3893#endif
3894 }
3895 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07003896 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 }
3898 }
3899 }
3900 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 *size = PyUnicode_WSTR_LENGTH(unicode);
3902 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003903}
3904
Alexander Belopolsky40018472011-02-26 01:02:56 +00003905Py_UNICODE *
3906PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909}
3910
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003911const Py_UNICODE *
3912_PyUnicode_AsUnicode(PyObject *unicode)
3913{
3914 Py_ssize_t size;
3915 const Py_UNICODE *wstr;
3916
3917 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3918 if (wstr && wcslen(wstr) != (size_t)size) {
3919 PyErr_SetString(PyExc_ValueError, "embedded null character");
3920 return NULL;
3921 }
3922 return wstr;
3923}
3924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925
Alexander Belopolsky40018472011-02-26 01:02:56 +00003926Py_ssize_t
3927PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928{
3929 if (!PyUnicode_Check(unicode)) {
3930 PyErr_BadArgument();
3931 goto onError;
3932 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003933 if (_PyUnicode_WSTR(unicode) == NULL) {
3934 if (PyUnicode_AsUnicode(unicode) == NULL)
3935 goto onError;
3936 }
3937 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 return -1;
3941}
3942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943Py_ssize_t
3944PyUnicode_GetLength(PyObject *unicode)
3945{
Victor Stinner07621332012-06-16 04:53:46 +02003946 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 PyErr_BadArgument();
3948 return -1;
3949 }
Victor Stinner07621332012-06-16 04:53:46 +02003950 if (PyUnicode_READY(unicode) == -1)
3951 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 return PyUnicode_GET_LENGTH(unicode);
3953}
3954
3955Py_UCS4
3956PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3957{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003958 void *data;
3959 int kind;
3960
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003961 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003962 PyErr_BadArgument();
3963 return (Py_UCS4)-1;
3964 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03003965 if (PyUnicode_READY(unicode) == -1) {
3966 return (Py_UCS4)-1;
3967 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003968 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003969 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 return (Py_UCS4)-1;
3971 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003972 data = PyUnicode_DATA(unicode);
3973 kind = PyUnicode_KIND(unicode);
3974 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975}
3976
3977int
3978PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3979{
3980 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003981 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 return -1;
3983 }
Victor Stinner488fa492011-12-12 00:01:39 +01003984 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003985 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003986 PyErr_SetString(PyExc_IndexError, "string index out of range");
3987 return -1;
3988 }
Victor Stinner488fa492011-12-12 00:01:39 +01003989 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003990 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003991 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3992 PyErr_SetString(PyExc_ValueError, "character out of range");
3993 return -1;
3994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3996 index, ch);
3997 return 0;
3998}
3999
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4005
Victor Stinner554f3f02010-06-16 23:33:54 +00004006/* create or adjust a UnicodeDecodeError */
4007static void
4008make_decode_exception(PyObject **exceptionObject,
4009 const char *encoding,
4010 const char *input, Py_ssize_t length,
4011 Py_ssize_t startpos, Py_ssize_t endpos,
4012 const char *reason)
4013{
4014 if (*exceptionObject == NULL) {
4015 *exceptionObject = PyUnicodeDecodeError_Create(
4016 encoding, input, length, startpos, endpos, reason);
4017 }
4018 else {
4019 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4020 goto onError;
4021 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4022 goto onError;
4023 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4024 goto onError;
4025 }
4026 return;
4027
4028onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004029 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004030}
4031
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders that write into a wchar_t
   based output string:
   look up the error handler if needed, build or update the exception,
   call the handler and validate its (str, int) result; if no exception
   occurred, copy the replacement to the output (growing it when needed)
   and update the input/output state variables passed by reference.
   Return 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the "Un;" format prefix doubles as the TypeError
       message used when the handler does not return a tuple. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *handler_result = NULL;
    PyObject *replacement = NULL;   /* borrowed from handler_result */
    PyObject *input_bytes = NULL;
    Py_ssize_t capacity;
    Py_ssize_t input_len;
    Py_ssize_t remaining;
    Py_ssize_t needed;
    Py_ssize_t newpos;
    wchar_t *rep_wstr;
    Py_ssize_t rep_wlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    capacity = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL) {
            goto onError;
        }
    }

    make_decode_exception(exceptionObject,
                          encoding,
                          *input, *inend - *input,
                          *startinpos, *endinpos,
                          reason);
    if (*exceptionObject == NULL) {
        goto onError;
    }

    handler_result = PyObject_CallFunctionObjArgs(*errorHandler,
                                                  *exceptionObject, NULL);
    if (handler_result == NULL) {
        goto onError;
    }
    if (!PyTuple_Check(handler_result)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(handler_result, argparse, &replacement, &newpos)) {
        goto onError;
    }

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    input_bytes = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (input_bytes == NULL) {
        goto onError;
    }
    *input = PyBytes_AS_STRING(input_bytes);
    input_len = PyBytes_GET_SIZE(input_bytes);
    *inend = *input + input_len;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(input_bytes);

    /* A negative position counts from the end of the input. */
    if (newpos < 0) {
        newpos += input_len;
    }
    if (newpos < 0 || newpos > input_len) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    rep_wstr = PyUnicode_AsUnicodeAndSize(replacement, &rep_wlen);
    if (rep_wstr == NULL) {
        goto onError;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    remaining = input_len - newpos;
    needed = *outpos;
    if (needed > PY_SSIZE_T_MAX - rep_wlen) {
        goto overflow;
    }
    needed += rep_wlen;
    if (needed > PY_SSIZE_T_MAX - remaining) {
        goto overflow;
    }
    needed += remaining;
    if (needed > capacity) {
        /* Grow to at least double the current size when that is enough
           and cannot overflow. */
        if (capacity <= PY_SSIZE_T_MAX/2 && needed < 2*capacity) {
            needed = 2*capacity;
        }
        if (unicode_resize(output, needed) < 0) {
            goto onError;
        }
    }

    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, rep_wstr, rep_wlen);
    *outpos += rep_wlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(handler_result);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(handler_result);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004144
4145static int
4146unicode_decode_call_errorhandler_writer(
4147 const char *errors, PyObject **errorHandler,
4148 const char *encoding, const char *reason,
4149 const char **input, const char **inend, Py_ssize_t *startinpos,
4150 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4151 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4152{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004153 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154
4155 PyObject *restuple = NULL;
4156 PyObject *repunicode = NULL;
4157 Py_ssize_t insize;
4158 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004159 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004160 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004161 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004162 int need_to_grow = 0;
4163 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004164
4165 if (*errorHandler == NULL) {
4166 *errorHandler = PyCodec_LookupError(errors);
4167 if (*errorHandler == NULL)
4168 goto onError;
4169 }
4170
4171 make_decode_exception(exceptionObject,
4172 encoding,
4173 *input, *inend - *input,
4174 *startinpos, *endinpos,
4175 reason);
4176 if (*exceptionObject == NULL)
4177 goto onError;
4178
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004179 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004180 if (restuple == NULL)
4181 goto onError;
4182 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004183 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004184 goto onError;
4185 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004186 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004187 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004188
4189 /* Copy back the bytes variables, which might have been modified by the
4190 callback */
4191 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4192 if (!inputobj)
4193 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004194 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004195 *input = PyBytes_AS_STRING(inputobj);
4196 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004197 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004198 /* we can DECREF safely, as the exception has another reference,
4199 so the object won't go away. */
4200 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004204 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004205 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004207 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208
Victor Stinner170ca6f2013-04-18 00:25:28 +02004209 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004210 if (replen > 1) {
4211 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004212 need_to_grow = 1;
4213 }
4214 new_inptr = *input + newpos;
4215 if (*inend - new_inptr > remain) {
4216 /* We don't know the decoding algorithm here so we make the worst
4217 assumption that one byte decodes to one unicode character.
4218 If unfortunately one byte could decode to more unicode characters,
4219 the decoder may write out-of-bound then. Is it possible for the
4220 algorithms using this function? */
4221 writer->min_length += *inend - new_inptr - remain;
4222 need_to_grow = 1;
4223 }
4224 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004225 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004226 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004227 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4228 goto onError;
4229 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004230 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004231 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004232
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004234 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004235
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004237 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004238 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243}
4244
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004245/* --- UTF-7 Codec -------------------------------------------------------- */
4246
Antoine Pitrou244651a2009-05-04 18:56:13 +00004247/* See RFC2152 for details. We encode conservatively and decode liberally. */
4248
/* Three simple macros defining base-64.
 *
 * All of them may evaluate their argument more than once, so only pass
 * expressions without side effects.
 */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (Any character outside the letter/digit/'+' ranges maps to 63, so the
   caller must check IS_BASE64() first.) */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4279
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code and is only ever read, so it is
 * declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so the table is never indexed out of
 * bounds; NUL is always rejected.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325
Alexander Belopolsky40018472011-02-26 01:02:56 +00004326PyObject *
4327PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004328 Py_ssize_t size,
4329 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004331 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4332}
4333
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334/* The decoder. The only state we preserve is our read position,
4335 * i.e. how many characters we have consumed. So if we end in the
4336 * middle of a shift sequence we have to back off the read position
4337 * and the output to the beginning of the sequence, otherwise we lose
4338 * all the shift state (seen bits, number of bits seen, high
4339 * surrogate). */
4340
Alexander Belopolsky40018472011-02-26 01:02:56 +00004341PyObject *
4342PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004343 Py_ssize_t size,
4344 const char *errors,
4345 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004348 Py_ssize_t startinpos;
4349 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004352 const char *errmsg = "";
4353 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004354 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 unsigned int base64bits = 0;
4356 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004357 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 PyObject *errorHandler = NULL;
4359 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004361 if (size == 0) {
4362 if (consumed)
4363 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004364 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004365 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004368 _PyUnicodeWriter_Init(&writer);
4369 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004370
4371 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 e = s + size;
4373
4374 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004375 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004377 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (inShift) { /* in a base-64 section */
4380 if (IS_BASE64(ch)) { /* consume a base-64 character */
4381 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4382 base64bits += 6;
4383 s++;
4384 if (base64bits >= 16) {
4385 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004386 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 base64bits -= 16;
4388 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004389 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 if (surrogate) {
4391 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004392 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4393 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004394 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004395 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004397 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 }
4399 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004400 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004401 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 }
4404 }
Victor Stinner551ac952011-11-29 22:58:13 +01004405 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 /* first surrogate */
4407 surrogate = outCh;
4408 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004410 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 }
4413 }
4414 }
4415 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 if (base64bits > 0) { /* left-over bits */
4418 if (base64bits >= 6) {
4419 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004420 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 errmsg = "partial character in shift sequence";
4422 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 else {
4425 /* Some bits remain; they should be zero */
4426 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004427 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 errmsg = "non-zero padding bits in shift sequence";
4429 goto utf7Error;
4430 }
4431 }
4432 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004433 if (surrogate && DECODE_DIRECT(ch)) {
4434 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4435 goto onError;
4436 }
4437 surrogate = 0;
4438 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 /* '-' is absorbed; other terminating
4440 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004441 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443 }
4444 }
4445 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 s++; /* consume '+' */
4448 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004450 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004451 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004453 else if (s < e && !IS_BASE64(*s)) {
4454 s++;
4455 errmsg = "ill-formed sequence";
4456 goto utf7Error;
4457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004460 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004461 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004463 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 }
4465 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004467 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004468 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 else {
4472 startinpos = s-starts;
4473 s++;
4474 errmsg = "unexpected special character";
4475 goto utf7Error;
4476 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004480 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 errors, &errorHandler,
4482 "utf7", errmsg,
4483 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486 }
4487
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 /* end of string */
4489
4490 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4491 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004492 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 if (surrogate ||
4494 (base64bits >= 6) ||
4495 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004497 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004498 errors, &errorHandler,
4499 "utf7", "unterminated shift sequence",
4500 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004501 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 goto onError;
4503 if (s < e)
4504 goto restart;
4505 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507
4508 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004509 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004511 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004512 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004513 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004514 writer.kind, writer.data, shiftOutStart);
4515 Py_XDECREF(errorHandler);
4516 Py_XDECREF(exc);
4517 _PyUnicodeWriter_Dealloc(&writer);
4518 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004519 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004520 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 }
4522 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004523 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004525 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 Py_XDECREF(errorHandler);
4528 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 Py_XDECREF(errorHandler);
4533 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 return NULL;
4536}
4537
4538
Alexander Belopolsky40018472011-02-26 01:02:56 +00004539PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004540_PyUnicode_EncodeUTF7(PyObject *str,
4541 int base64SetO,
4542 int base64WhiteSpace,
4543 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004545 int kind;
4546 void *data;
4547 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004548 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004549 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004550 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 unsigned int base64bits = 0;
4552 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 char * out;
4554 char * start;
4555
Benjamin Petersonbac79492012-01-14 13:34:47 -05004556 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004557 return NULL;
4558 kind = PyUnicode_KIND(str);
4559 data = PyUnicode_DATA(str);
4560 len = PyUnicode_GET_LENGTH(str);
4561
4562 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004564
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004565 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004566 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004567 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004568 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569 if (v == NULL)
4570 return NULL;
4571
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004572 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004573 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004574 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 if (inShift) {
4577 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4578 /* shifting out */
4579 if (base64bits) { /* output remaining bits */
4580 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4581 base64buffer = 0;
4582 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583 }
4584 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 /* Characters not in the BASE64 set implicitly unshift the sequence
4586 so no '-' is required, except if the character is itself a '-' */
4587 if (IS_BASE64(ch) || ch == '-') {
4588 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 *out++ = (char) ch;
4591 }
4592 else {
4593 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004594 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004595 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 else { /* not in a shift sequence */
4597 if (ch == '+') {
4598 *out++ = '+';
4599 *out++ = '-';
4600 }
4601 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4602 *out++ = (char) ch;
4603 }
4604 else {
4605 *out++ = '+';
4606 inShift = 1;
4607 goto encode_char;
4608 }
4609 }
4610 continue;
4611encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004613 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004614
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 /* code first surrogate */
4616 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004617 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 while (base64bits >= 6) {
4619 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4620 base64bits -= 6;
4621 }
4622 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004623 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 base64bits += 16;
4626 base64buffer = (base64buffer << 16) | ch;
4627 while (base64bits >= 6) {
4628 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4629 base64bits -= 6;
4630 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004631 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 if (base64bits)
4633 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4634 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004635 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004636 if (_PyBytes_Resize(&v, out - start) < 0)
4637 return NULL;
4638 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004639}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004640PyObject *
4641PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4642 Py_ssize_t size,
4643 int base64SetO,
4644 int base64WhiteSpace,
4645 const char *errors)
4646{
4647 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004648 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004649 if (tmp == NULL)
4650 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004651 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004652 base64WhiteSpace, errors);
4653 Py_DECREF(tmp);
4654 return result;
4655}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657#undef IS_BASE64
4658#undef FROM_BASE64
4659#undef TO_BASE64
4660#undef DECODE_DIRECT
4661#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663/* --- UTF-8 Codec -------------------------------------------------------- */
4664
Alexander Belopolsky40018472011-02-26 01:02:56 +00004665PyObject *
4666PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004667 Py_ssize_t size,
4668 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669{
Walter Dörwald69652032004-09-07 20:24:22 +00004670 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4671}
4672
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673#include "stringlib/asciilib.h"
4674#include "stringlib/codecs.h"
4675#include "stringlib/undef.h"
4676
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004677#include "stringlib/ucs1lib.h"
4678#include "stringlib/codecs.h"
4679#include "stringlib/undef.h"
4680
4681#include "stringlib/ucs2lib.h"
4682#include "stringlib/codecs.h"
4683#include "stringlib/undef.h"
4684
4685#include "stringlib/ucs4lib.h"
4686#include "stringlib/codecs.h"
4687#include "stringlib/undef.h"
4688
Antoine Pitrouab868312009-01-10 15:40:25 +00004689/* Mask to quickly check whether a C 'long' contains a
4690 non-ASCII, UTF8-encoded char. */
4691#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004692# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004693#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004694# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004695#else
4696# error C 'long' size should be either 4 or 8!
4697#endif
4698
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699static Py_ssize_t
4700ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004701{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004703 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004704
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004705 /*
4706 * Issue #17237: m68k is a bit different from most architectures in
4707 * that objects do not use "natural alignment" - for example, int and
4708 * long are only aligned at 2-byte boundaries. Therefore the assert()
4709 * won't work; also, tests have shown that skipping the "optimised
4710 * version" will even speed up m68k.
4711 */
4712#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004714 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4715 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 /* Fast path, see in STRINGLIB(utf8_decode) for
4717 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004718 /* Help allocation */
4719 const char *_p = p;
4720 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 while (_p < aligned_end) {
4722 unsigned long value = *(const unsigned long *) _p;
4723 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 *((unsigned long *)q) = value;
4726 _p += SIZEOF_LONG;
4727 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004728 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 p = _p;
4730 while (p < end) {
4731 if ((unsigned char)*p & 0x80)
4732 break;
4733 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004738#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 while (p < end) {
4740 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4741 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004742 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004743 /* Help allocation */
4744 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 while (_p < aligned_end) {
4746 unsigned long value = *(unsigned long *) _p;
4747 if (value & ASCII_CHAR_MASK)
4748 break;
4749 _p += SIZEOF_LONG;
4750 }
4751 p = _p;
4752 if (_p == end)
4753 break;
4754 }
4755 if ((unsigned char)*p & 0x80)
4756 break;
4757 ++p;
4758 }
4759 memcpy(dest, start, p - start);
4760 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761}
Antoine Pitrouab868312009-01-10 15:40:25 +00004762
Victor Stinner785938e2011-12-11 20:09:03 +01004763PyObject *
4764PyUnicode_DecodeUTF8Stateful(const char *s,
4765 Py_ssize_t size,
4766 const char *errors,
4767 Py_ssize_t *consumed)
4768{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004769 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004770 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772
4773 Py_ssize_t startinpos;
4774 Py_ssize_t endinpos;
4775 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004776 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004777 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004778 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004779
4780 if (size == 0) {
4781 if (consumed)
4782 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004783 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004784 }
4785
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4787 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004788 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 *consumed = 1;
4790 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004791 }
4792
Victor Stinner8f674cc2013-04-17 23:02:17 +02004793 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004794 writer.min_length = size;
4795 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004796 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004797
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004798 writer.pos = ascii_decode(s, end, writer.data);
4799 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 while (s < end) {
4801 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004803
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 if (PyUnicode_IS_ASCII(writer.buffer))
4806 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004810 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811 } else {
4812 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004813 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004814 }
4815
4816 switch (ch) {
4817 case 0:
4818 if (s == end || consumed)
4819 goto End;
4820 errmsg = "unexpected end of data";
4821 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004822 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 break;
4824 case 1:
4825 errmsg = "invalid start byte";
4826 startinpos = s - starts;
4827 endinpos = startinpos + 1;
4828 break;
4829 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004830 case 3:
4831 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 errmsg = "invalid continuation byte";
4833 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004834 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 break;
4836 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004837 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 goto onError;
4839 continue;
4840 }
4841
Victor Stinner1d65d912015-10-05 13:43:50 +02004842 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004843 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004844
4845 switch (error_handler) {
4846 case _Py_ERROR_IGNORE:
4847 s += (endinpos - startinpos);
4848 break;
4849
4850 case _Py_ERROR_REPLACE:
4851 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4852 goto onError;
4853 s += (endinpos - startinpos);
4854 break;
4855
4856 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004857 {
4858 Py_ssize_t i;
4859
Victor Stinner1d65d912015-10-05 13:43:50 +02004860 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4861 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004862 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004863 ch = (Py_UCS4)(unsigned char)(starts[i]);
4864 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4865 ch + 0xdc00);
4866 writer.pos++;
4867 }
4868 s += (endinpos - startinpos);
4869 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004870 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004871
4872 default:
4873 if (unicode_decode_call_errorhandler_writer(
4874 errors, &error_handler_obj,
4875 "utf-8", errmsg,
4876 &starts, &end, &startinpos, &endinpos, &exc, &s,
4877 &writer))
4878 goto onError;
4879 }
Victor Stinner785938e2011-12-11 20:09:03 +01004880 }
4881
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883 if (consumed)
4884 *consumed = s - starts;
4885
Victor Stinner1d65d912015-10-05 13:43:50 +02004886 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004888 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889
4890onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004891 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004893 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004895}
4896
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004897
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004898/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4899 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004900
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004901 On success, write a pointer to a newly allocated wide character string into
4902 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4903 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004904
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004905 On memory allocation failure, return -1.
4906
4907 On decoding error (if surrogateescape is zero), return -2. If wlen is
4908 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4909 is not NULL, write the decoding error message into *reason. */
4910int
4911_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02004912 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004913{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004914 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004915 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 wchar_t *unicode;
4917 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004918
Victor Stinner3d4226a2018-08-29 22:21:32 +02004919 int surrogateescape = 0;
4920 int surrogatepass = 0;
4921 switch (errors)
4922 {
4923 case _Py_ERROR_STRICT:
4924 break;
4925 case _Py_ERROR_SURROGATEESCAPE:
4926 surrogateescape = 1;
4927 break;
4928 case _Py_ERROR_SURROGATEPASS:
4929 surrogatepass = 1;
4930 break;
4931 default:
4932 return -3;
4933 }
4934
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004935 /* Note: size will always be longer than the resulting Unicode
4936 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004937 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004938 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004939 }
4940
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004941 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004942 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004943 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004944 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004945
4946 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004947 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004949 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004951#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004953#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004955#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 if (ch > 0xFF) {
4957#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004958 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004960 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004961 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4963 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4964#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004965 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02004967 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004969 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004970
4971 if (surrogateescape) {
4972 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4973 }
4974 else {
4975 /* Is it a valid three-byte code? */
4976 if (surrogatepass
4977 && (e - s) >= 3
4978 && (s[0] & 0xf0) == 0xe0
4979 && (s[1] & 0xc0) == 0x80
4980 && (s[2] & 0xc0) == 0x80)
4981 {
4982 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4983 s += 3;
4984 unicode[outpos++] = ch;
4985 }
4986 else {
4987 PyMem_RawFree(unicode );
4988 if (reason != NULL) {
4989 switch (ch) {
4990 case 0:
4991 *reason = "unexpected end of data";
4992 break;
4993 case 1:
4994 *reason = "invalid start byte";
4995 break;
4996 /* 2, 3, 4 */
4997 default:
4998 *reason = "invalid continuation byte";
4999 break;
5000 }
5001 }
5002 if (wlen != NULL) {
5003 *wlen = s - orig_s;
5004 }
5005 return -2;
5006 }
5007 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005009 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005011 if (wlen) {
5012 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005013 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005014 *wstr = unicode;
5015 return 0;
5016}
5017
5018wchar_t*
5019_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5020{
5021 wchar_t *wstr;
5022 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5023 if (res != 0) {
5024 return NULL;
5025 }
5026 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005027}
5028
Antoine Pitrouab868312009-01-10 15:40:25 +00005029
Victor Stinnere47e6982017-12-21 15:45:16 +01005030/* UTF-8 encoder using the surrogateescape error handler .
5031
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005032 On success, return 0 and write the newly allocated character string (use
5033 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005034
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005035 On encoding failure, return -2 and write the position of the invalid
5036 surrogate character into *error_pos (if error_pos is set) and the decoding
5037 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005038
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005039 On memory allocation failure, return -1. */
5040int
5041_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005042 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005043{
5044 const Py_ssize_t max_char_size = 4;
5045 Py_ssize_t len = wcslen(text);
5046
5047 assert(len >= 0);
5048
Victor Stinner3d4226a2018-08-29 22:21:32 +02005049 int surrogateescape = 0;
5050 int surrogatepass = 0;
5051 switch (errors)
5052 {
5053 case _Py_ERROR_STRICT:
5054 break;
5055 case _Py_ERROR_SURROGATEESCAPE:
5056 surrogateescape = 1;
5057 break;
5058 case _Py_ERROR_SURROGATEPASS:
5059 surrogatepass = 1;
5060 break;
5061 default:
5062 return -3;
5063 }
5064
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005065 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5066 return -1;
5067 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005068 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005069 if (raw_malloc) {
5070 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005071 }
5072 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005073 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005074 }
5075 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005077 }
5078
5079 char *p = bytes;
5080 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005081 for (i = 0; i < len; ) {
5082 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005083 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005084 i++;
5085#if Py_UNICODE_SIZE == 2
5086 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5087 && i < len
5088 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5089 {
5090 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5091 i++;
5092 }
5093#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005094
5095 if (ch < 0x80) {
5096 /* Encode ASCII */
5097 *p++ = (char) ch;
5098
5099 }
5100 else if (ch < 0x0800) {
5101 /* Encode Latin-1 */
5102 *p++ = (char)(0xc0 | (ch >> 6));
5103 *p++ = (char)(0x80 | (ch & 0x3f));
5104 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005105 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005106 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005107 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005108 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005109 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005110 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005111 if (reason != NULL) {
5112 *reason = "encoding error";
5113 }
5114 if (raw_malloc) {
5115 PyMem_RawFree(bytes);
5116 }
5117 else {
5118 PyMem_Free(bytes);
5119 }
5120 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005121 }
5122 *p++ = (char)(ch & 0xff);
5123 }
5124 else if (ch < 0x10000) {
5125 *p++ = (char)(0xe0 | (ch >> 12));
5126 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5127 *p++ = (char)(0x80 | (ch & 0x3f));
5128 }
5129 else { /* ch >= 0x10000 */
5130 assert(ch <= MAX_UNICODE);
5131 /* Encode UCS4 Unicode ordinals */
5132 *p++ = (char)(0xf0 | (ch >> 18));
5133 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5134 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5135 *p++ = (char)(0x80 | (ch & 0x3f));
5136 }
5137 }
5138 *p++ = '\0';
5139
5140 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005141 char *bytes2;
5142 if (raw_malloc) {
5143 bytes2 = PyMem_RawRealloc(bytes, final_size);
5144 }
5145 else {
5146 bytes2 = PyMem_Realloc(bytes, final_size);
5147 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005148 if (bytes2 == NULL) {
5149 if (error_pos != NULL) {
5150 *error_pos = (size_t)-1;
5151 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005152 if (raw_malloc) {
5153 PyMem_RawFree(bytes);
5154 }
5155 else {
5156 PyMem_Free(bytes);
5157 }
5158 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005159 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 *str = bytes2;
5161 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005162}
5163
5164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165/* Primary internal function which creates utf8 encoded bytes objects.
5166
5167 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005168 and allocate exactly as much space needed at the end. Else allocate the
5169 maximum possible needed (4 result bytes per Unicode character), and return
5170 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005171*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005172PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005173_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
Victor Stinner6099a032011-12-18 14:22:26 +01005175 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005176 void *data;
5177 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005179 if (!PyUnicode_Check(unicode)) {
5180 PyErr_BadArgument();
5181 return NULL;
5182 }
5183
5184 if (PyUnicode_READY(unicode) == -1)
5185 return NULL;
5186
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005187 if (PyUnicode_UTF8(unicode))
5188 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5189 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005190
5191 kind = PyUnicode_KIND(unicode);
5192 data = PyUnicode_DATA(unicode);
5193 size = PyUnicode_GET_LENGTH(unicode);
5194
Benjamin Petersonead6b532011-12-20 17:23:42 -06005195 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005196 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005197 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005198 case PyUnicode_1BYTE_KIND:
5199 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5200 assert(!PyUnicode_IS_ASCII(unicode));
5201 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5202 case PyUnicode_2BYTE_KIND:
5203 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5204 case PyUnicode_4BYTE_KIND:
5205 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207}
5208
Alexander Belopolsky40018472011-02-26 01:02:56 +00005209PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005210PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5211 Py_ssize_t size,
5212 const char *errors)
5213{
5214 PyObject *v, *unicode;
5215
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005216 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005217 if (unicode == NULL)
5218 return NULL;
5219 v = _PyUnicode_AsUTF8String(unicode, errors);
5220 Py_DECREF(unicode);
5221 return v;
5222}
5223
5224PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005225PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005227 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228}
5229
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230/* --- UTF-32 Codec ------------------------------------------------------- */
5231
5232PyObject *
5233PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 Py_ssize_t size,
5235 const char *errors,
5236 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005237{
5238 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5239}
5240
5241PyObject *
5242PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 Py_ssize_t size,
5244 const char *errors,
5245 int *byteorder,
5246 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005247{
5248 const char *starts = s;
5249 Py_ssize_t startinpos;
5250 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005251 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005252 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005253 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005254 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005256 PyObject *errorHandler = NULL;
5257 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005258
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259 q = (unsigned char *)s;
5260 e = q + size;
5261
5262 if (byteorder)
5263 bo = *byteorder;
5264
5265 /* Check for BOM marks (U+FEFF) in the input and adjust current
5266 byte order setting accordingly. In native mode, the leading BOM
5267 mark is skipped, in all other modes, it is copied to the output
5268 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005269 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005270 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 if (bom == 0x0000FEFF) {
5272 bo = -1;
5273 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005275 else if (bom == 0xFFFE0000) {
5276 bo = 1;
5277 q += 4;
5278 }
5279 if (byteorder)
5280 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005281 }
5282
Victor Stinnere64322e2012-10-30 23:12:47 +01005283 if (q == e) {
5284 if (consumed)
5285 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005286 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005287 }
5288
Victor Stinnere64322e2012-10-30 23:12:47 +01005289#ifdef WORDS_BIGENDIAN
5290 le = bo < 0;
5291#else
5292 le = bo <= 0;
5293#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005294 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005295
Victor Stinner8f674cc2013-04-17 23:02:17 +02005296 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005297 writer.min_length = (e - q + 3) / 4;
5298 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005300
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 while (1) {
5302 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005303 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005304
Victor Stinnere64322e2012-10-30 23:12:47 +01005305 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 enum PyUnicode_Kind kind = writer.kind;
5307 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005309 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005310 if (le) {
5311 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005312 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005313 if (ch > maxch)
5314 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 if (kind != PyUnicode_1BYTE_KIND &&
5316 Py_UNICODE_IS_SURROGATE(ch))
5317 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005319 q += 4;
5320 } while (q <= last);
5321 }
5322 else {
5323 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005324 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005325 if (ch > maxch)
5326 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005327 if (kind != PyUnicode_1BYTE_KIND &&
5328 Py_UNICODE_IS_SURROGATE(ch))
5329 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 q += 4;
5332 } while (q <= last);
5333 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005334 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 }
5336
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005337 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005338 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005339 startinpos = ((const char *)q) - starts;
5340 endinpos = startinpos + 4;
5341 }
5342 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005345 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005347 startinpos = ((const char *)q) - starts;
5348 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005350 else {
5351 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005352 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005353 goto onError;
5354 q += 4;
5355 continue;
5356 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005357 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005358 startinpos = ((const char *)q) - starts;
5359 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005361
5362 /* The remaining input chars are ignored if the callback
5363 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005366 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 }
5371
Walter Dörwald41980ca2007-08-16 21:55:45 +00005372 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375 Py_XDECREF(errorHandler);
5376 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005377 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005380 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381 Py_XDECREF(errorHandler);
5382 Py_XDECREF(exc);
5383 return NULL;
5384}
5385
5386PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005387_PyUnicode_EncodeUTF32(PyObject *str,
5388 const char *errors,
5389 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005390{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 enum PyUnicode_Kind kind;
5392 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005393 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005394 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005395 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005396#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005399 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005400#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005401 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005402 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005403 PyObject *errorHandler = NULL;
5404 PyObject *exc = NULL;
5405 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005406
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005407 if (!PyUnicode_Check(str)) {
5408 PyErr_BadArgument();
5409 return NULL;
5410 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005411 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005412 return NULL;
5413 kind = PyUnicode_KIND(str);
5414 data = PyUnicode_DATA(str);
5415 len = PyUnicode_GET_LENGTH(str);
5416
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005418 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005419 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005420 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421 if (v == NULL)
5422 return NULL;
5423
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 /* output buffer is 4-bytes aligned */
5425 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005426 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005432 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005433 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005434 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005436 else
5437 encoding = "utf-32";
5438
5439 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5441 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005442 }
5443
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005444 pos = 0;
5445 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005447
5448 if (kind == PyUnicode_2BYTE_KIND) {
5449 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5450 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005451 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005452 else {
5453 assert(kind == PyUnicode_4BYTE_KIND);
5454 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5455 &out, native_ordering);
5456 }
5457 if (pos == len)
5458 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005459
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 rep = unicode_encode_call_errorhandler(
5461 errors, &errorHandler,
5462 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005463 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 if (!rep)
5465 goto error;
5466
5467 if (PyBytes_Check(rep)) {
5468 repsize = PyBytes_GET_SIZE(rep);
5469 if (repsize & 3) {
5470 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005471 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005472 "surrogates not allowed");
5473 goto error;
5474 }
5475 moreunits = repsize / 4;
5476 }
5477 else {
5478 assert(PyUnicode_Check(rep));
5479 if (PyUnicode_READY(rep) < 0)
5480 goto error;
5481 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5482 if (!PyUnicode_IS_ASCII(rep)) {
5483 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005484 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 "surrogates not allowed");
5486 goto error;
5487 }
5488 }
5489
5490 /* four bytes are reserved for each surrogate */
5491 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005492 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005493 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 /* integer overflow */
5495 PyErr_NoMemory();
5496 goto error;
5497 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005498 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005500 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005501 }
5502
5503 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005504 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005505 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005508 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5509 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005510 }
5511
5512 Py_CLEAR(rep);
5513 }
5514
5515 /* Cut back to size actually needed. This is necessary for, for example,
5516 encoding of a string containing isolated surrogates and the 'ignore'
5517 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005518 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005519 if (nsize != PyBytes_GET_SIZE(v))
5520 _PyBytes_Resize(&v, nsize);
5521 Py_XDECREF(errorHandler);
5522 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005523 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005524 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005525 error:
5526 Py_XDECREF(rep);
5527 Py_XDECREF(errorHandler);
5528 Py_XDECREF(exc);
5529 Py_XDECREF(v);
5530 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005531}
5532
Alexander Belopolsky40018472011-02-26 01:02:56 +00005533PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005534PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5535 Py_ssize_t size,
5536 const char *errors,
5537 int byteorder)
5538{
5539 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005540 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005541 if (tmp == NULL)
5542 return NULL;
5543 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5544 Py_DECREF(tmp);
5545 return result;
5546}
5547
5548PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550{
Victor Stinnerb960b342011-11-20 19:12:52 +01005551 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005552}
5553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554/* --- UTF-16 Codec ------------------------------------------------------- */
5555
Tim Peters772747b2001-08-09 22:21:55 +00005556PyObject *
5557PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 Py_ssize_t size,
5559 const char *errors,
5560 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561{
Walter Dörwald69652032004-09-07 20:24:22 +00005562 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5563}
5564
5565PyObject *
5566PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 Py_ssize_t size,
5568 const char *errors,
5569 int *byteorder,
5570 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005571{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005573 Py_ssize_t startinpos;
5574 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005576 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005577 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005578 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005579 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 PyObject *errorHandler = NULL;
5581 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005582 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
Tim Peters772747b2001-08-09 22:21:55 +00005584 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005585 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
5587 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005588 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005590 /* Check for BOM marks (U+FEFF) in the input and adjust current
5591 byte order setting accordingly. In native mode, the leading BOM
5592 mark is skipped, in all other modes, it is copied to the output
5593 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005594 if (bo == 0 && size >= 2) {
5595 const Py_UCS4 bom = (q[1] << 8) | q[0];
5596 if (bom == 0xFEFF) {
5597 q += 2;
5598 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005600 else if (bom == 0xFFFE) {
5601 q += 2;
5602 bo = 1;
5603 }
5604 if (byteorder)
5605 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 if (q == e) {
5609 if (consumed)
5610 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005611 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005612 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613
Christian Heimes743e0cd2012-10-17 23:52:17 +02005614#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005615 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005616 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005617#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005619 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005620#endif
Tim Peters772747b2001-08-09 22:21:55 +00005621
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005623 character count normally. Error handler will take care of
5624 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005625 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005626 writer.min_length = (e - q + 1) / 2;
5627 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005629
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 while (1) {
5631 Py_UCS4 ch = 0;
5632 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005636 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 native_ordering);
5639 else
5640 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 native_ordering);
5643 } else if (kind == PyUnicode_2BYTE_KIND) {
5644 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005645 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005646 native_ordering);
5647 } else {
5648 assert(kind == PyUnicode_4BYTE_KIND);
5649 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005650 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005652 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654
Antoine Pitrou63065d72012-05-15 23:48:04 +02005655 switch (ch)
5656 {
5657 case 0:
5658 /* remaining byte at the end? (size should be even) */
5659 if (q == e || consumed)
5660 goto End;
5661 errmsg = "truncated data";
5662 startinpos = ((const char *)q) - starts;
5663 endinpos = ((const char *)e) - starts;
5664 break;
5665 /* The remaining input chars are ignored if the callback
5666 chooses to skip the input */
5667 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005668 q -= 2;
5669 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005670 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005671 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005672 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005673 endinpos = ((const char *)e) - starts;
5674 break;
5675 case 2:
5676 errmsg = "illegal encoding";
5677 startinpos = ((const char *)q) - 2 - starts;
5678 endinpos = startinpos + 2;
5679 break;
5680 case 3:
5681 errmsg = "illegal UTF-16 surrogate";
5682 startinpos = ((const char *)q) - 4 - starts;
5683 endinpos = startinpos + 2;
5684 break;
5685 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005686 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 continue;
5689 }
5690
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005692 errors,
5693 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005694 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005695 &starts,
5696 (const char **)&e,
5697 &startinpos,
5698 &endinpos,
5699 &exc,
5700 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 }
5704
Antoine Pitrou63065d72012-05-15 23:48:04 +02005705End:
Walter Dörwald69652032004-09-07 20:24:22 +00005706 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 Py_XDECREF(errorHandler);
5710 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005711 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005714 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 Py_XDECREF(errorHandler);
5716 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 return NULL;
5718}
5719
Tim Peters772747b2001-08-09 22:21:55 +00005720PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005721_PyUnicode_EncodeUTF16(PyObject *str,
5722 const char *errors,
5723 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 enum PyUnicode_Kind kind;
5726 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005727 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005728 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005729 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005731#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005732 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005733#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005734 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005735#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 const char *encoding;
5737 Py_ssize_t nsize, pos;
5738 PyObject *errorHandler = NULL;
5739 PyObject *exc = NULL;
5740 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005741
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005742 if (!PyUnicode_Check(str)) {
5743 PyErr_BadArgument();
5744 return NULL;
5745 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005746 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005747 return NULL;
5748 kind = PyUnicode_KIND(str);
5749 data = PyUnicode_DATA(str);
5750 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005751
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005753 if (kind == PyUnicode_4BYTE_KIND) {
5754 const Py_UCS4 *in = (const Py_UCS4 *)data;
5755 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 while (in < end) {
5757 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005758 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005759 }
5760 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 nsize = len + pairs + (byteorder == 0);
5766 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005771 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005772 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005773 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005775 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 }
5777 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005778 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005779 }
Tim Peters772747b2001-08-09 22:21:55 +00005780
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 if (kind == PyUnicode_1BYTE_KIND) {
5782 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5783 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005784 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005785
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005786 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005788 }
5789 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005791 }
5792 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005794 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005795
5796 pos = 0;
5797 while (pos < len) {
5798 Py_ssize_t repsize, moreunits;
5799
5800 if (kind == PyUnicode_2BYTE_KIND) {
5801 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5802 &out, native_ordering);
5803 }
5804 else {
5805 assert(kind == PyUnicode_4BYTE_KIND);
5806 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5807 &out, native_ordering);
5808 }
5809 if (pos == len)
5810 break;
5811
5812 rep = unicode_encode_call_errorhandler(
5813 errors, &errorHandler,
5814 encoding, "surrogates not allowed",
5815 str, &exc, pos, pos + 1, &pos);
5816 if (!rep)
5817 goto error;
5818
5819 if (PyBytes_Check(rep)) {
5820 repsize = PyBytes_GET_SIZE(rep);
5821 if (repsize & 1) {
5822 raise_encode_exception(&exc, encoding,
5823 str, pos - 1, pos,
5824 "surrogates not allowed");
5825 goto error;
5826 }
5827 moreunits = repsize / 2;
5828 }
5829 else {
5830 assert(PyUnicode_Check(rep));
5831 if (PyUnicode_READY(rep) < 0)
5832 goto error;
5833 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5834 if (!PyUnicode_IS_ASCII(rep)) {
5835 raise_encode_exception(&exc, encoding,
5836 str, pos - 1, pos,
5837 "surrogates not allowed");
5838 goto error;
5839 }
5840 }
5841
5842 /* two bytes are reserved for each surrogate */
5843 if (moreunits > 1) {
5844 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005845 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 /* integer overflow */
5847 PyErr_NoMemory();
5848 goto error;
5849 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005850 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005851 goto error;
5852 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5853 }
5854
5855 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005856 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005857 out += moreunits;
5858 } else /* rep is unicode */ {
5859 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5860 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5861 &out, native_ordering);
5862 }
5863
5864 Py_CLEAR(rep);
5865 }
5866
5867 /* Cut back to size actually needed. This is necessary for, for example,
5868 encoding of a string containing isolated surrogates and the 'ignore' handler
5869 is used. */
5870 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5871 if (nsize != PyBytes_GET_SIZE(v))
5872 _PyBytes_Resize(&v, nsize);
5873 Py_XDECREF(errorHandler);
5874 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005875 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005876 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005877 error:
5878 Py_XDECREF(rep);
5879 Py_XDECREF(errorHandler);
5880 Py_XDECREF(exc);
5881 Py_XDECREF(v);
5882 return NULL;
5883#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
Alexander Belopolsky40018472011-02-26 01:02:56 +00005886PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5888 Py_ssize_t size,
5889 const char *errors,
5890 int byteorder)
5891{
5892 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005893 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 if (tmp == NULL)
5895 return NULL;
5896 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5897 Py_DECREF(tmp);
5898 return result;
5899}
5900
5901PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005902PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905}
5906
5907/* --- Unicode Escape Codec ----------------------------------------------- */
5908
Fredrik Lundh06d12682001-01-24 07:59:11 +00005909static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005910
Alexander Belopolsky40018472011-02-26 01:02:56 +00005911PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005912_PyUnicode_DecodeUnicodeEscape(const char *s,
5913 Py_ssize_t size,
5914 const char *errors,
5915 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005918 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 PyObject *errorHandler = NULL;
5921 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005922
Eric V. Smith42454af2016-10-31 09:22:08 -04005923 // so we can remember if we've seen an invalid escape char or not
5924 *first_invalid_escape = NULL;
5925
Victor Stinner62ec3312016-09-06 17:04:34 -07005926 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005927 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005928 }
5929 /* Escaped strings will always be longer than the resulting
5930 Unicode string, so we start with size here and then reduce the
5931 length after conversion to the true value.
5932 (but if the error callback returns a long replacement string
5933 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005934 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005935 writer.min_length = size;
5936 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5937 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005938 }
5939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 end = s + size;
5941 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005942 unsigned char c = (unsigned char) *s++;
5943 Py_UCS4 ch;
5944 int count;
5945 Py_ssize_t startinpos;
5946 Py_ssize_t endinpos;
5947 const char *message;
5948
5949#define WRITE_ASCII_CHAR(ch) \
5950 do { \
5951 assert(ch <= 127); \
5952 assert(writer.pos < writer.size); \
5953 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5954 } while(0)
5955
5956#define WRITE_CHAR(ch) \
5957 do { \
5958 if (ch <= writer.maxchar) { \
5959 assert(writer.pos < writer.size); \
5960 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5961 } \
5962 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5963 goto onError; \
5964 } \
5965 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
5967 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005968 if (c != '\\') {
5969 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 continue;
5971 }
5972
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005975 if (s >= end) {
5976 message = "\\ at end of string";
5977 goto error;
5978 }
5979 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005980
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005982 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 case '\n': continue;
5986 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5987 case '\'': WRITE_ASCII_CHAR('\''); continue;
5988 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5989 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5992 case 't': WRITE_ASCII_CHAR('\t'); continue;
5993 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5994 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005995 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005996 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005997 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 case '0': case '1': case '2': case '3':
6002 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006004 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006005 ch = (ch<<3) + *s++ - '0';
6006 if (s < end && '0' <= *s && *s <= '7') {
6007 ch = (ch<<3) + *s++ - '0';
6008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006010 WRITE_CHAR(ch);
6011 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 /* hex escapes */
6014 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006017 message = "truncated \\xXX escape";
6018 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006023 message = "truncated \\uXXXX escape";
6024 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006027 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006028 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006029 message = "truncated \\UXXXXXXXX escape";
6030 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006031 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006032 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006033 ch <<= 4;
6034 if (c >= '0' && c <= '9') {
6035 ch += c - '0';
6036 }
6037 else if (c >= 'a' && c <= 'f') {
6038 ch += c - ('a' - 10);
6039 }
6040 else if (c >= 'A' && c <= 'F') {
6041 ch += c - ('A' - 10);
6042 }
6043 else {
6044 break;
6045 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006046 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006048 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006049 }
6050
6051 /* when we get here, ch is a 32-bit unicode character */
6052 if (ch > MAX_UNICODE) {
6053 message = "illegal Unicode character";
6054 goto error;
6055 }
6056
6057 WRITE_CHAR(ch);
6058 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006061 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 if (ucnhash_CAPI == NULL) {
6063 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006064 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6065 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 if (ucnhash_CAPI == NULL) {
6067 PyErr_SetString(
6068 PyExc_UnicodeError,
6069 "\\N escapes not supported (can't load unicodedata module)"
6070 );
6071 goto onError;
6072 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006074
6075 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006076 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006077 const char *start = ++s;
6078 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006079 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 namelen = s - start;
6083 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006084 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006085 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006086 ch = 0xffffffff; /* in case 'getcode' messes up */
6087 if (namelen <= INT_MAX &&
6088 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6089 &ch, 0)) {
6090 assert(ch <= MAX_UNICODE);
6091 WRITE_CHAR(ch);
6092 continue;
6093 }
6094 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006095 }
6096 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006097 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006098
6099 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006100 if (*first_invalid_escape == NULL) {
6101 *first_invalid_escape = s-1; /* Back up one char, since we've
6102 already incremented s. */
6103 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006104 WRITE_ASCII_CHAR('\\');
6105 WRITE_CHAR(c);
6106 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006108
6109 error:
6110 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006111 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006112 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006113 errors, &errorHandler,
6114 "unicodeescape", message,
6115 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006116 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006117 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006118 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006119 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006120
6121#undef WRITE_ASCII_CHAR
6122#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006124
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006125 Py_XDECREF(errorHandler);
6126 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006128
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006130 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 Py_XDECREF(errorHandler);
6132 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 return NULL;
6134}
6135
Eric V. Smith42454af2016-10-31 09:22:08 -04006136PyObject *
6137PyUnicode_DecodeUnicodeEscape(const char *s,
6138 Py_ssize_t size,
6139 const char *errors)
6140{
6141 const char *first_invalid_escape;
6142 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6143 &first_invalid_escape);
6144 if (result == NULL)
6145 return NULL;
6146 if (first_invalid_escape != NULL) {
6147 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6148 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006149 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006150 Py_DECREF(result);
6151 return NULL;
6152 }
6153 }
6154 return result;
6155}
6156
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006157/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
Alexander Belopolsky40018472011-02-26 01:02:56 +00006159PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006165 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006167 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
Ezio Melottie7f90372012-10-05 03:33:31 +03006169 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006170 escape.
6171
Ezio Melottie7f90372012-10-05 03:33:31 +03006172 For UCS1 strings it's '\xxx', 4 bytes per source character.
6173 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6174 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006175 */
6176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (!PyUnicode_Check(unicode)) {
6178 PyErr_BadArgument();
6179 return NULL;
6180 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 }
Victor Stinner358af132015-10-12 22:36:57 +02006184
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 if (len == 0) {
6187 return PyBytes_FromStringAndSize(NULL, 0);
6188 }
6189
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 kind = PyUnicode_KIND(unicode);
6191 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6193 bytes, and 1 byte characters 4. */
6194 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006195 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 return PyErr_NoMemory();
6197 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006198 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 if (repr == NULL) {
6200 return NULL;
6201 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006205 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006206
Victor Stinner62ec3312016-09-06 17:04:34 -07006207 /* U+0000-U+00ff range */
6208 if (ch < 0x100) {
6209 if (ch >= ' ' && ch < 127) {
6210 if (ch != '\\') {
6211 /* Copy printable US ASCII as-is */
6212 *p++ = (char) ch;
6213 }
6214 /* Escape backslashes */
6215 else {
6216 *p++ = '\\';
6217 *p++ = '\\';
6218 }
6219 }
Victor Stinner358af132015-10-12 22:36:57 +02006220
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 /* Map special whitespace to '\t', \n', '\r' */
6222 else if (ch == '\t') {
6223 *p++ = '\\';
6224 *p++ = 't';
6225 }
6226 else if (ch == '\n') {
6227 *p++ = '\\';
6228 *p++ = 'n';
6229 }
6230 else if (ch == '\r') {
6231 *p++ = '\\';
6232 *p++ = 'r';
6233 }
6234
6235 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6236 else {
6237 *p++ = '\\';
6238 *p++ = 'x';
6239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
6241 }
Tim Petersced69f82003-09-16 20:30:58 +00006242 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006243 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 *p++ = '\\';
6246 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006247 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6248 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6249 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6250 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6253 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006254
Victor Stinner62ec3312016-09-06 17:04:34 -07006255 /* Make sure that the first two digits are zero */
6256 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006257 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 *p++ = 'U';
6259 *p++ = '0';
6260 *p++ = '0';
6261 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6262 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6263 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6264 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6265 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6266 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 assert(p - PyBytes_AS_STRING(repr) > 0);
6271 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6272 return NULL;
6273 }
6274 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275}
6276
Alexander Belopolsky40018472011-02-26 01:02:56 +00006277PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6279 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006281 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006282 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006285 }
6286
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006287 result = PyUnicode_AsUnicodeEscapeString(tmp);
6288 Py_DECREF(tmp);
6289 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290}
6291
6292/* --- Raw Unicode Escape Codec ------------------------------------------- */
6293
Alexander Belopolsky40018472011-02-26 01:02:56 +00006294PyObject *
6295PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006296 Py_ssize_t size,
6297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006300 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006302 PyObject *errorHandler = NULL;
6303 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006304
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 /* Escaped strings will always be longer than the resulting
6310 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 length after conversion to the true value. (But decoding error
6312 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006313 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 writer.min_length = size;
6315 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6316 goto onError;
6317 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006318
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 end = s + size;
6320 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 unsigned char c = (unsigned char) *s++;
6322 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006323 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006324 Py_ssize_t startinpos;
6325 Py_ssize_t endinpos;
6326 const char *message;
6327
6328#define WRITE_CHAR(ch) \
6329 do { \
6330 if (ch <= writer.maxchar) { \
6331 assert(writer.pos < writer.size); \
6332 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6333 } \
6334 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6335 goto onError; \
6336 } \
6337 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 if (c != '\\' || s >= end) {
6341 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006344
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 c = (unsigned char) *s++;
6346 if (c == 'u') {
6347 count = 4;
6348 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 else if (c == 'U') {
6351 count = 8;
6352 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006353 }
6354 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 assert(writer.pos < writer.size);
6356 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6357 WRITE_CHAR(c);
6358 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006359 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006360 startinpos = s - starts - 2;
6361
6362 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6363 for (ch = 0; count && s < end; ++s, --count) {
6364 c = (unsigned char)*s;
6365 ch <<= 4;
6366 if (c >= '0' && c <= '9') {
6367 ch += c - '0';
6368 }
6369 else if (c >= 'a' && c <= 'f') {
6370 ch += c - ('a' - 10);
6371 }
6372 else if (c >= 'A' && c <= 'F') {
6373 ch += c - ('A' - 10);
6374 }
6375 else {
6376 break;
6377 }
6378 }
6379 if (!count) {
6380 if (ch <= MAX_UNICODE) {
6381 WRITE_CHAR(ch);
6382 continue;
6383 }
6384 message = "\\Uxxxxxxxx out of range";
6385 }
6386
6387 endinpos = s-starts;
6388 writer.min_length = end - s + writer.pos;
6389 if (unicode_decode_call_errorhandler_writer(
6390 errors, &errorHandler,
6391 "rawunicodeescape", message,
6392 &starts, &end, &startinpos, &endinpos, &exc, &s,
6393 &writer)) {
6394 goto onError;
6395 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006396 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006397
6398#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 Py_XDECREF(errorHandler);
6401 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006402 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006403
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006405 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 Py_XDECREF(errorHandler);
6407 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410}
6411
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412
Alexander Belopolsky40018472011-02-26 01:02:56 +00006413PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415{
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 int kind;
6420 void *data;
6421 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 if (!PyUnicode_Check(unicode)) {
6424 PyErr_BadArgument();
6425 return NULL;
6426 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006430 kind = PyUnicode_KIND(unicode);
6431 data = PyUnicode_DATA(unicode);
6432 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 if (kind == PyUnicode_1BYTE_KIND) {
6434 return PyBytes_FromStringAndSize(data, len);
6435 }
Victor Stinner0e368262011-11-10 20:12:49 +01006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6438 bytes, and 1 byte characters 4. */
6439 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006440
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 if (len > PY_SSIZE_T_MAX / expandsize) {
6442 return PyErr_NoMemory();
6443 }
6444 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6445 if (repr == NULL) {
6446 return NULL;
6447 }
6448 if (len == 0) {
6449 return repr;
6450 }
6451
6452 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006453 for (pos = 0; pos < len; pos++) {
6454 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006455
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6457 if (ch < 0x100) {
6458 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006459 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006460 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 *p++ = '\\';
6463 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006464 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6465 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6466 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6467 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6470 else {
6471 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6472 *p++ = '\\';
6473 *p++ = 'U';
6474 *p++ = '0';
6475 *p++ = '0';
6476 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6477 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6478 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6479 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6480 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6481 *p++ = Py_hexdigits[ch & 15];
6482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006484
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 assert(p > PyBytes_AS_STRING(repr));
6486 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6487 return NULL;
6488 }
6489 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490}
6491
Alexander Belopolsky40018472011-02-26 01:02:56 +00006492PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6494 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006497 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006498 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006499 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006500 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6501 Py_DECREF(tmp);
6502 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006505/* --- Unicode Internal Codec ------------------------------------------- */
6506
Alexander Belopolsky40018472011-02-26 01:02:56 +00006507PyObject *
6508_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006509 Py_ssize_t size,
6510 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006511{
6512 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006513 Py_ssize_t startinpos;
6514 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006515 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006516 const char *end;
6517 const char *reason;
6518 PyObject *errorHandler = NULL;
6519 PyObject *exc = NULL;
6520
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006521 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006522 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006523 1))
6524 return NULL;
6525
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006526 if (size < 0) {
6527 PyErr_BadInternalCall();
6528 return NULL;
6529 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006530 if (size == 0)
6531 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006532
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 _PyUnicodeWriter_Init(&writer);
6534 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6535 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006537 }
6538 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006539
Victor Stinner8f674cc2013-04-17 23:02:17 +02006540 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006541 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006542 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006543 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006544 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006545 endinpos = end-starts;
6546 reason = "truncated input";
6547 goto error;
6548 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549 /* We copy the raw representation one byte at a time because the
6550 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006551 ((char *) &uch)[0] = s[0];
6552 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006553#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006554 ((char *) &uch)[2] = s[2];
6555 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006556#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006557 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006559 /* We have to sanity check the raw data, otherwise doom looms for
6560 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006561 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562 endinpos = s - starts + Py_UNICODE_SIZE;
6563 reason = "illegal code point (> 0x10FFFF)";
6564 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006565 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006566#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 s += Py_UNICODE_SIZE;
6568#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006569 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006571 Py_UNICODE uch2;
6572 ((char *) &uch2)[0] = s[0];
6573 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006574 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575 {
Victor Stinner551ac952011-11-29 22:58:13 +01006576 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006577 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578 }
6579 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006580#endif
6581
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006582 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006583 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006584 continue;
6585
6586 error:
6587 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006588 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006589 errors, &errorHandler,
6590 "unicode_internal", reason,
6591 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006592 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006593 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594 }
6595
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006596 Py_XDECREF(errorHandler);
6597 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006598 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006599
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006601 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006602 Py_XDECREF(errorHandler);
6603 Py_XDECREF(exc);
6604 return NULL;
6605}
6606
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607/* --- Latin-1 Codec ------------------------------------------------------ */
6608
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609PyObject *
6610PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006611 Py_ssize_t size,
6612 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006615 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616}
6617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619static void
6620make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006621 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006622 PyObject *unicode,
6623 Py_ssize_t startpos, Py_ssize_t endpos,
6624 const char *reason)
6625{
6626 if (*exceptionObject == NULL) {
6627 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006628 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006629 encoding, unicode, startpos, endpos, reason);
6630 }
6631 else {
6632 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6633 goto onError;
6634 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6635 goto onError;
6636 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6637 goto onError;
6638 return;
6639 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006640 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006641 }
6642}
6643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006645static void
6646raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006647 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006648 PyObject *unicode,
6649 Py_ssize_t startpos, Py_ssize_t endpos,
6650 const char *reason)
6651{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006652 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006653 encoding, unicode, startpos, endpos, reason);
6654 if (*exceptionObject != NULL)
6655 PyCodec_StrictErrors(*exceptionObject);
6656}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657
6658/* error handling callback helper:
6659 build arguments, call the callback and check the arguments,
6660 put the result into newpos and return the replacement string, which
6661 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662static PyObject *
6663unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006664 PyObject **errorHandler,
6665 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006667 Py_ssize_t startpos, Py_ssize_t endpos,
6668 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006670 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006671 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 PyObject *restuple;
6673 PyObject *resunicode;
6674
6675 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 }
6680
Benjamin Petersonbac79492012-01-14 13:34:47 -05006681 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 return NULL;
6683 len = PyUnicode_GET_LENGTH(unicode);
6684
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006685 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006686 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006690 restuple = PyObject_CallFunctionObjArgs(
6691 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 Py_DECREF(restuple);
6697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006699 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 &resunicode, newpos)) {
6701 Py_DECREF(restuple);
6702 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006704 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6705 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6706 Py_DECREF(restuple);
6707 return NULL;
6708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006710 *newpos = len + *newpos;
6711 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006712 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 Py_DECREF(restuple);
6714 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006715 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 Py_INCREF(resunicode);
6717 Py_DECREF(restuple);
6718 return resunicode;
6719}
6720
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006723 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006724 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 /* input state */
6727 Py_ssize_t pos=0, size;
6728 int kind;
6729 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 /* pointer into the output */
6731 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006732 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6733 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006734 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006736 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006737 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006738 /* output object */
6739 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740
Benjamin Petersonbac79492012-01-14 13:34:47 -05006741 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006742 return NULL;
6743 size = PyUnicode_GET_LENGTH(unicode);
6744 kind = PyUnicode_KIND(unicode);
6745 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 /* allocate enough for a simple encoding without
6747 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006748 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006749 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006750
6751 _PyBytesWriter_Init(&writer);
6752 str = _PyBytesWriter_Alloc(&writer, size);
6753 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006754 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006757 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006760 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006762 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006764 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006766 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006769 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006771
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006772 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006774
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006775 /* Only overallocate the buffer if it's not the last write */
6776 writer.overallocate = (collend < size);
6777
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006779 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006780 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006781
6782 switch (error_handler) {
6783 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006784 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006786
6787 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006788 memset(str, '?', collend - collstart);
6789 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006790 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006791 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 break;
Victor Stinner50149202015-09-22 00:26:54 +02006794
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006796 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006797 writer.min_size -= (collend - collstart);
6798 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006799 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800 if (str == NULL)
6801 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006802 pos = collend;
6803 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006804
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006805 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006806 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006807 writer.min_size -= (collend - collstart);
6808 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006809 unicode, collstart, collend);
6810 if (str == NULL)
6811 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006812 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 break;
Victor Stinner50149202015-09-22 00:26:54 +02006814
Victor Stinnerc3713e92015-09-29 12:32:13 +02006815 case _Py_ERROR_SURROGATEESCAPE:
6816 for (i = collstart; i < collend; ++i) {
6817 ch = PyUnicode_READ(kind, data, i);
6818 if (ch < 0xdc80 || 0xdcff < ch) {
6819 /* Not a UTF-8b surrogate */
6820 break;
6821 }
6822 *str++ = (char)(ch - 0xdc00);
6823 ++pos;
6824 }
6825 if (i >= collend)
6826 break;
6827 collstart = pos;
6828 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006829 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006830
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006832 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6833 encoding, reason, unicode, &exc,
6834 collstart, collend, &newpos);
6835 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006837
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006838 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006839 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006840
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006842 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006843 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 PyBytes_AS_STRING(rep),
6845 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006846 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006847 else {
6848 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006849
Victor Stinner6bd525b2015-10-09 13:10:05 +02006850 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006852
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006853 if (limit == 256 ?
6854 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6855 !PyUnicode_IS_ASCII(rep))
6856 {
6857 /* Not all characters are smaller than limit */
6858 raise_encode_exception(&exc, encoding, unicode,
6859 collstart, collend, reason);
6860 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006862 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6863 str = _PyBytesWriter_WriteBytes(&writer, str,
6864 PyUnicode_DATA(rep),
6865 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006867 if (str == NULL)
6868 goto onError;
6869
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006870 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006871 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006872 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006873
6874 /* If overallocation was disabled, ensure that it was the last
6875 write. Otherwise, we missed an optimization */
6876 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006877 }
6878 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879
Victor Stinner50149202015-09-22 00:26:54 +02006880 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006882 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006883
6884 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006885 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006886 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006887 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006888 Py_XDECREF(exc);
6889 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890}
6891
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006892/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006893PyObject *
6894PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006895 Py_ssize_t size,
6896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006898 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006899 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 if (unicode == NULL)
6901 return NULL;
6902 result = unicode_encode_ucs1(unicode, errors, 256);
6903 Py_DECREF(unicode);
6904 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905}
6906
Alexander Belopolsky40018472011-02-26 01:02:56 +00006907PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006908_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909{
6910 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 PyErr_BadArgument();
6912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006914 if (PyUnicode_READY(unicode) == -1)
6915 return NULL;
6916 /* Fast path: if it is a one-byte string, construct
6917 bytes object directly. */
6918 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6919 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6920 PyUnicode_GET_LENGTH(unicode));
6921 /* Non-Latin-1 characters present. Defer to above function to
6922 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006923 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006924}
6925
6926PyObject*
6927PyUnicode_AsLatin1String(PyObject *unicode)
6928{
6929 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930}
6931
6932/* --- 7-bit ASCII Codec -------------------------------------------------- */
6933
Alexander Belopolsky40018472011-02-26 01:02:56 +00006934PyObject *
6935PyUnicode_DecodeASCII(const char *s,
6936 Py_ssize_t size,
6937 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006940 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006941 int kind;
6942 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006943 Py_ssize_t startinpos;
6944 Py_ssize_t endinpos;
6945 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006947 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006948 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006949 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006952 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006953
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006955 if (size == 1 && (unsigned char)s[0] < 128)
6956 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006957
Victor Stinner8f674cc2013-04-17 23:02:17 +02006958 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006959 writer.min_length = size;
6960 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006961 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006964 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006965 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 writer.pos = outpos;
6967 if (writer.pos == size)
6968 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006969
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 s += writer.pos;
6971 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006972 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006973 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006975 PyUnicode_WRITE(kind, data, writer.pos, c);
6976 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006978 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006980
6981 /* byte outsize range 0x00..0x7f: call the error handler */
6982
6983 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006984 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006985
6986 switch (error_handler)
6987 {
6988 case _Py_ERROR_REPLACE:
6989 case _Py_ERROR_SURROGATEESCAPE:
6990 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006991 but we may switch to UCS2 at the first write */
6992 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6993 goto onError;
6994 kind = writer.kind;
6995 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006996
6997 if (error_handler == _Py_ERROR_REPLACE)
6998 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6999 else
7000 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7001 writer.pos++;
7002 ++s;
7003 break;
7004
7005 case _Py_ERROR_IGNORE:
7006 ++s;
7007 break;
7008
7009 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 startinpos = s-starts;
7011 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007013 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 "ascii", "ordinal not in range(128)",
7015 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007016 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 kind = writer.kind;
7019 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007022 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007023 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007024 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007025
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007027 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007028 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007029 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 return NULL;
7031}
7032
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007033/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007034PyObject *
7035PyUnicode_EncodeASCII(const Py_UNICODE *p,
7036 Py_ssize_t size,
7037 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007040 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007041 if (unicode == NULL)
7042 return NULL;
7043 result = unicode_encode_ucs1(unicode, errors, 128);
7044 Py_DECREF(unicode);
7045 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
Alexander Belopolsky40018472011-02-26 01:02:56 +00007048PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007049_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050{
7051 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 PyErr_BadArgument();
7053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055 if (PyUnicode_READY(unicode) == -1)
7056 return NULL;
7057 /* Fast path: if it is an ASCII-only string, construct bytes object
7058 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007059 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007060 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7061 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007062 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007063}
7064
7065PyObject *
7066PyUnicode_AsASCIIString(PyObject *unicode)
7067{
7068 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069}
7070
Steve Dowercc16be82016-09-08 10:35:16 -07007071#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007072
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007073/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007074
/* The code page helpers below take their chunk length as an 'int'.  When
   size_t is wider than int, an input may not fit in a single call and has
   to be processed in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Provide the flag value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7082
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007083static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007084code_page_name(UINT code_page, PyObject **obj)
7085{
7086 *obj = NULL;
7087 if (code_page == CP_ACP)
7088 return "mbcs";
7089 if (code_page == CP_UTF7)
7090 return "CP_UTF7";
7091 if (code_page == CP_UTF8)
7092 return "CP_UTF8";
7093
7094 *obj = PyBytes_FromFormat("cp%u", code_page);
7095 if (*obj == NULL)
7096 return NULL;
7097 return PyBytes_AS_STRING(*obj);
7098}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099
Victor Stinner3a50e702011-10-18 21:21:00 +02007100static DWORD
7101decode_code_page_flags(UINT code_page)
7102{
7103 if (code_page == CP_UTF7) {
7104 /* The CP_UTF7 decoder only supports flags=0 */
7105 return 0;
7106 }
7107 else
7108 return MB_ERR_INVALID_CHARS;
7109}
7110
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 * Decode a byte string from a Windows code page into unicode object in strict
7113 * mode.
7114 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007115 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7116 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007119decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007120 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 const char *in,
7122 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123{
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007125 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007127
7128 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 assert(insize > 0);
7130 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7131 if (outsize <= 0)
7132 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133
7134 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007136 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007137 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 if (*v == NULL)
7139 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141 }
7142 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007144 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007145 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148 }
7149
7150 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7152 if (outsize <= 0)
7153 goto error;
7154 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007155
Victor Stinner3a50e702011-10-18 21:21:00 +02007156error:
7157 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7158 return -2;
7159 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007160 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007161}
7162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163/*
7164 * Decode a byte string from a code page into unicode object with an error
7165 * handler.
7166 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007167 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 * UnicodeDecodeError exception and returns -1 on error.
7169 */
7170static int
7171decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007172 PyObject **v,
7173 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007174 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007175{
7176 const char *startin = in;
7177 const char *endin = in + size;
7178 const DWORD flags = decode_code_page_flags(code_page);
7179 /* Ideally, we should get reason from FormatMessage. This is the Windows
7180 2000 English version of the message. */
7181 const char *reason = "No mapping for the Unicode character exists "
7182 "in the target code page.";
7183 /* each step cannot decode more than 1 character, but a character can be
7184 represented as a surrogate pair */
7185 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007186 int insize;
7187 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 PyObject *errorHandler = NULL;
7189 PyObject *exc = NULL;
7190 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007191 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 DWORD err;
7193 int ret = -1;
7194
7195 assert(size > 0);
7196
7197 encoding = code_page_name(code_page, &encoding_obj);
7198 if (encoding == NULL)
7199 return -1;
7200
Victor Stinner7d00cc12014-03-17 23:08:06 +01007201 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7203 UnicodeDecodeError. */
7204 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7205 if (exc != NULL) {
7206 PyCodec_StrictErrors(exc);
7207 Py_CLEAR(exc);
7208 }
7209 goto error;
7210 }
7211
7212 if (*v == NULL) {
7213 /* Create unicode object */
7214 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7215 PyErr_NoMemory();
7216 goto error;
7217 }
Victor Stinnerab595942011-12-17 04:59:06 +01007218 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007219 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 if (*v == NULL)
7221 goto error;
7222 startout = PyUnicode_AS_UNICODE(*v);
7223 }
7224 else {
7225 /* Extend unicode object */
7226 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7227 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7228 PyErr_NoMemory();
7229 goto error;
7230 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007231 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 goto error;
7233 startout = PyUnicode_AS_UNICODE(*v) + n;
7234 }
7235
7236 /* Decode the byte string character per character */
7237 out = startout;
7238 while (in < endin)
7239 {
7240 /* Decode a character */
7241 insize = 1;
7242 do
7243 {
7244 outsize = MultiByteToWideChar(code_page, flags,
7245 in, insize,
7246 buffer, Py_ARRAY_LENGTH(buffer));
7247 if (outsize > 0)
7248 break;
7249 err = GetLastError();
7250 if (err != ERROR_NO_UNICODE_TRANSLATION
7251 && err != ERROR_INSUFFICIENT_BUFFER)
7252 {
7253 PyErr_SetFromWindowsErr(0);
7254 goto error;
7255 }
7256 insize++;
7257 }
7258 /* 4=maximum length of a UTF-8 sequence */
7259 while (insize <= 4 && (in + insize) <= endin);
7260
7261 if (outsize <= 0) {
7262 Py_ssize_t startinpos, endinpos, outpos;
7263
Victor Stinner7d00cc12014-03-17 23:08:06 +01007264 /* last character in partial decode? */
7265 if (in + insize >= endin && !final)
7266 break;
7267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 startinpos = in - startin;
7269 endinpos = startinpos + 1;
7270 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007271 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 errors, &errorHandler,
7273 encoding, reason,
7274 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007275 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 {
7277 goto error;
7278 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007279 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 }
7281 else {
7282 in += insize;
7283 memcpy(out, buffer, outsize * sizeof(wchar_t));
7284 out += outsize;
7285 }
7286 }
7287
7288 /* write a NUL character at the end */
7289 *out = 0;
7290
7291 /* Extend unicode object */
7292 outsize = out - startout;
7293 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007294 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007296 /* (in - startin) <= size and size is an int */
7297 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007298
7299error:
7300 Py_XDECREF(encoding_obj);
7301 Py_XDECREF(errorHandler);
7302 Py_XDECREF(exc);
7303 return ret;
7304}
7305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306static PyObject *
7307decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007308 const char *s, Py_ssize_t size,
7309 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310{
Victor Stinner76a31a62011-11-04 00:05:13 +01007311 PyObject *v = NULL;
7312 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 if (code_page < 0) {
7315 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7316 return NULL;
7317 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007318 if (size < 0) {
7319 PyErr_BadInternalCall();
7320 return NULL;
7321 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007322
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 do
7327 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007329 if (size > INT_MAX) {
7330 chunk_size = INT_MAX;
7331 final = 0;
7332 done = 0;
7333 }
7334 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007335#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 {
7337 chunk_size = (int)size;
7338 final = (consumed == NULL);
7339 done = 1;
7340 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 if (chunk_size == 0 && done) {
7343 if (v != NULL)
7344 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007345 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007346 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347
Victor Stinner76a31a62011-11-04 00:05:13 +01007348 converted = decode_code_page_strict(code_page, &v,
7349 s, chunk_size);
7350 if (converted == -2)
7351 converted = decode_code_page_errors(code_page, &v,
7352 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007353 errors, final);
7354 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007355
7356 if (converted < 0) {
7357 Py_XDECREF(v);
7358 return NULL;
7359 }
7360
7361 if (consumed)
7362 *consumed += converted;
7363
7364 s += converted;
7365 size -= converted;
7366 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007367
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007368 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369}
7370
Alexander Belopolsky40018472011-02-26 01:02:56 +00007371PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007372PyUnicode_DecodeCodePageStateful(int code_page,
7373 const char *s,
7374 Py_ssize_t size,
7375 const char *errors,
7376 Py_ssize_t *consumed)
7377{
7378 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7379}
7380
7381PyObject *
7382PyUnicode_DecodeMBCSStateful(const char *s,
7383 Py_ssize_t size,
7384 const char *errors,
7385 Py_ssize_t *consumed)
7386{
7387 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7388}
7389
7390PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007391PyUnicode_DecodeMBCS(const char *s,
7392 Py_ssize_t size,
7393 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007394{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7396}
7397
Victor Stinner3a50e702011-10-18 21:21:00 +02007398static DWORD
7399encode_code_page_flags(UINT code_page, const char *errors)
7400{
7401 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007402 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 }
7404 else if (code_page == CP_UTF7) {
7405 /* CP_UTF7 only supports flags=0 */
7406 return 0;
7407 }
7408 else {
7409 if (errors != NULL && strcmp(errors, "replace") == 0)
7410 return 0;
7411 else
7412 return WC_NO_BEST_FIT_CHARS;
7413 }
7414}
7415
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007416/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 * Encode a Unicode string to a Windows code page into a byte string in strict
7418 * mode.
7419 *
7420 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007421 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007424encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007425 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427{
Victor Stinner554f3f02010-06-16 23:33:54 +00007428 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 BOOL *pusedDefaultChar = &usedDefaultChar;
7430 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007431 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007432 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 const DWORD flags = encode_code_page_flags(code_page, NULL);
7434 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007435 /* Create a substring so that we can get the UTF-16 representation
7436 of just the slice under consideration. */
7437 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438
Martin v. Löwis3d325192011-11-04 18:23:06 +01007439 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007440
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007442 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007444 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007445
Victor Stinner2fc507f2011-11-04 20:06:39 +01007446 substring = PyUnicode_Substring(unicode, offset, offset+len);
7447 if (substring == NULL)
7448 return -1;
7449 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7450 if (p == NULL) {
7451 Py_DECREF(substring);
7452 return -1;
7453 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007454 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007455
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007456 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007458 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 NULL, 0,
7460 NULL, pusedDefaultChar);
7461 if (outsize <= 0)
7462 goto error;
7463 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007464 if (pusedDefaultChar && *pusedDefaultChar) {
7465 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007468
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 if (*outbytes == NULL) {
7473 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477 }
7478 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 const Py_ssize_t n = PyBytes_Size(*outbytes);
7481 if (outsize > PY_SSIZE_T_MAX - n) {
7482 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007486 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7487 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007491 }
7492
7493 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007495 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 out, outsize,
7497 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 if (outsize <= 0)
7500 goto error;
7501 if (pusedDefaultChar && *pusedDefaultChar)
7502 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007503 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007504
Victor Stinner3a50e702011-10-18 21:21:00 +02007505error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7508 return -2;
7509 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007510 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007511}
7512
Victor Stinner3a50e702011-10-18 21:21:00 +02007513/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007514 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 * error handler.
7516 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007517 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 * -1 on other error.
7519 */
7520static int
7521encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007522 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007523 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007524{
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007526 Py_ssize_t pos = unicode_offset;
7527 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 /* Ideally, we should get reason from FormatMessage. This is the Windows
7529 2000 English version of the message. */
7530 const char *reason = "invalid character";
7531 /* 4=maximum length of a UTF-8 sequence */
7532 char buffer[4];
7533 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7534 Py_ssize_t outsize;
7535 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 PyObject *errorHandler = NULL;
7537 PyObject *exc = NULL;
7538 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007539 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007540 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 PyObject *rep;
7542 int ret = -1;
7543
7544 assert(insize > 0);
7545
7546 encoding = code_page_name(code_page, &encoding_obj);
7547 if (encoding == NULL)
7548 return -1;
7549
7550 if (errors == NULL || strcmp(errors, "strict") == 0) {
7551 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7552 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007553 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 if (exc != NULL) {
7555 PyCodec_StrictErrors(exc);
7556 Py_DECREF(exc);
7557 }
7558 Py_XDECREF(encoding_obj);
7559 return -1;
7560 }
7561
7562 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7563 pusedDefaultChar = &usedDefaultChar;
7564 else
7565 pusedDefaultChar = NULL;
7566
7567 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7568 PyErr_NoMemory();
7569 goto error;
7570 }
7571 outsize = insize * Py_ARRAY_LENGTH(buffer);
7572
7573 if (*outbytes == NULL) {
7574 /* Create string object */
7575 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7576 if (*outbytes == NULL)
7577 goto error;
7578 out = PyBytes_AS_STRING(*outbytes);
7579 }
7580 else {
7581 /* Extend string object */
7582 Py_ssize_t n = PyBytes_Size(*outbytes);
7583 if (n > PY_SSIZE_T_MAX - outsize) {
7584 PyErr_NoMemory();
7585 goto error;
7586 }
7587 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7588 goto error;
7589 out = PyBytes_AS_STRING(*outbytes) + n;
7590 }
7591
7592 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007593 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007595 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7596 wchar_t chars[2];
7597 int charsize;
7598 if (ch < 0x10000) {
7599 chars[0] = (wchar_t)ch;
7600 charsize = 1;
7601 }
7602 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007603 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7604 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007605 charsize = 2;
7606 }
7607
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007609 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 buffer, Py_ARRAY_LENGTH(buffer),
7611 NULL, pusedDefaultChar);
7612 if (outsize > 0) {
7613 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7614 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007615 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 memcpy(out, buffer, outsize);
7617 out += outsize;
7618 continue;
7619 }
7620 }
7621 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7622 PyErr_SetFromWindowsErr(0);
7623 goto error;
7624 }
7625
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 rep = unicode_encode_call_errorhandler(
7627 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007628 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 if (rep == NULL)
7631 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007632 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007633
7634 if (PyBytes_Check(rep)) {
7635 outsize = PyBytes_GET_SIZE(rep);
7636 if (outsize != 1) {
7637 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7638 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7639 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7640 Py_DECREF(rep);
7641 goto error;
7642 }
7643 out = PyBytes_AS_STRING(*outbytes) + offset;
7644 }
7645 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7646 out += outsize;
7647 }
7648 else {
7649 Py_ssize_t i;
7650 enum PyUnicode_Kind kind;
7651 void *data;
7652
Benjamin Petersonbac79492012-01-14 13:34:47 -05007653 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 Py_DECREF(rep);
7655 goto error;
7656 }
7657
7658 outsize = PyUnicode_GET_LENGTH(rep);
7659 if (outsize != 1) {
7660 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7661 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7662 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7663 Py_DECREF(rep);
7664 goto error;
7665 }
7666 out = PyBytes_AS_STRING(*outbytes) + offset;
7667 }
7668 kind = PyUnicode_KIND(rep);
7669 data = PyUnicode_DATA(rep);
7670 for (i=0; i < outsize; i++) {
7671 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7672 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007673 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 encoding, unicode,
7675 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 "unable to encode error handler result to ASCII");
7677 Py_DECREF(rep);
7678 goto error;
7679 }
7680 *out = (unsigned char)ch;
7681 out++;
7682 }
7683 }
7684 Py_DECREF(rep);
7685 }
7686 /* write a NUL byte */
7687 *out = 0;
7688 outsize = out - PyBytes_AS_STRING(*outbytes);
7689 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7690 if (_PyBytes_Resize(outbytes, outsize) < 0)
7691 goto error;
7692 ret = 0;
7693
7694error:
7695 Py_XDECREF(encoding_obj);
7696 Py_XDECREF(errorHandler);
7697 Py_XDECREF(exc);
7698 return ret;
7699}
7700
Victor Stinner3a50e702011-10-18 21:21:00 +02007701static PyObject *
7702encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007703 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 const char *errors)
7705{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007706 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007707 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007708 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007710
Victor Stinner29dacf22015-01-26 16:41:32 +01007711 if (!PyUnicode_Check(unicode)) {
7712 PyErr_BadArgument();
7713 return NULL;
7714 }
7715
Benjamin Petersonbac79492012-01-14 13:34:47 -05007716 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717 return NULL;
7718 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007719
Victor Stinner3a50e702011-10-18 21:21:00 +02007720 if (code_page < 0) {
7721 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7722 return NULL;
7723 }
7724
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007726 return PyBytes_FromStringAndSize(NULL, 0);
7727
Victor Stinner7581cef2011-11-03 22:32:33 +01007728 offset = 0;
7729 do
7730 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007731#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007732 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007733 chunks. */
7734 if (len > INT_MAX/2) {
7735 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 done = 0;
7737 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007739#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 done = 1;
7743 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007744
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007747 errors);
7748 if (ret == -2)
7749 ret = encode_code_page_errors(code_page, &outbytes,
7750 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007752 if (ret < 0) {
7753 Py_XDECREF(outbytes);
7754 return NULL;
7755 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007756
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007759 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007760
Victor Stinner3a50e702011-10-18 21:21:00 +02007761 return outbytes;
7762}
7763
7764PyObject *
7765PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7766 Py_ssize_t size,
7767 const char *errors)
7768{
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007770 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007771 if (unicode == NULL)
7772 return NULL;
7773 res = encode_code_page(CP_ACP, unicode, errors);
7774 Py_DECREF(unicode);
7775 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007776}
7777
7778PyObject *
7779PyUnicode_EncodeCodePage(int code_page,
7780 PyObject *unicode,
7781 const char *errors)
7782{
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007784}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007785
Alexander Belopolsky40018472011-02-26 01:02:56 +00007786PyObject *
7787PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007788{
Victor Stinner7581cef2011-11-03 22:32:33 +01007789 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007790}
7791
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007792#undef NEED_RETRY
7793
Steve Dowercc16be82016-09-08 10:35:16 -07007794#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796/* --- Character Mapping Codec -------------------------------------------- */
7797
Victor Stinnerfb161b12013-04-18 01:44:27 +02007798static int
7799charmap_decode_string(const char *s,
7800 Py_ssize_t size,
7801 PyObject *mapping,
7802 const char *errors,
7803 _PyUnicodeWriter *writer)
7804{
7805 const char *starts = s;
7806 const char *e;
7807 Py_ssize_t startinpos, endinpos;
7808 PyObject *errorHandler = NULL, *exc = NULL;
7809 Py_ssize_t maplen;
7810 enum PyUnicode_Kind mapkind;
7811 void *mapdata;
7812 Py_UCS4 x;
7813 unsigned char ch;
7814
7815 if (PyUnicode_READY(mapping) == -1)
7816 return -1;
7817
7818 maplen = PyUnicode_GET_LENGTH(mapping);
7819 mapdata = PyUnicode_DATA(mapping);
7820 mapkind = PyUnicode_KIND(mapping);
7821
7822 e = s + size;
7823
7824 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7825 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7826 * is disabled in encoding aliases, latin1 is preferred because
7827 * its implementation is faster. */
7828 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7829 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7830 Py_UCS4 maxchar = writer->maxchar;
7831
7832 assert (writer->kind == PyUnicode_1BYTE_KIND);
7833 while (s < e) {
7834 ch = *s;
7835 x = mapdata_ucs1[ch];
7836 if (x > maxchar) {
7837 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7838 goto onError;
7839 maxchar = writer->maxchar;
7840 outdata = (Py_UCS1 *)writer->data;
7841 }
7842 outdata[writer->pos] = x;
7843 writer->pos++;
7844 ++s;
7845 }
7846 return 0;
7847 }
7848
7849 while (s < e) {
7850 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7851 enum PyUnicode_Kind outkind = writer->kind;
7852 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7853 if (outkind == PyUnicode_1BYTE_KIND) {
7854 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7855 Py_UCS4 maxchar = writer->maxchar;
7856 while (s < e) {
7857 ch = *s;
7858 x = mapdata_ucs2[ch];
7859 if (x > maxchar)
7860 goto Error;
7861 outdata[writer->pos] = x;
7862 writer->pos++;
7863 ++s;
7864 }
7865 break;
7866 }
7867 else if (outkind == PyUnicode_2BYTE_KIND) {
7868 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7869 while (s < e) {
7870 ch = *s;
7871 x = mapdata_ucs2[ch];
7872 if (x == 0xFFFE)
7873 goto Error;
7874 outdata[writer->pos] = x;
7875 writer->pos++;
7876 ++s;
7877 }
7878 break;
7879 }
7880 }
7881 ch = *s;
7882
7883 if (ch < maplen)
7884 x = PyUnicode_READ(mapkind, mapdata, ch);
7885 else
7886 x = 0xfffe; /* invalid value */
7887Error:
7888 if (x == 0xfffe)
7889 {
7890 /* undefined mapping */
7891 startinpos = s-starts;
7892 endinpos = startinpos+1;
7893 if (unicode_decode_call_errorhandler_writer(
7894 errors, &errorHandler,
7895 "charmap", "character maps to <undefined>",
7896 &starts, &e, &startinpos, &endinpos, &exc, &s,
7897 writer)) {
7898 goto onError;
7899 }
7900 continue;
7901 }
7902
7903 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7904 goto onError;
7905 ++s;
7906 }
7907 Py_XDECREF(errorHandler);
7908 Py_XDECREF(exc);
7909 return 0;
7910
7911onError:
7912 Py_XDECREF(errorHandler);
7913 Py_XDECREF(exc);
7914 return -1;
7915}
7916
7917static int
7918charmap_decode_mapping(const char *s,
7919 Py_ssize_t size,
7920 PyObject *mapping,
7921 const char *errors,
7922 _PyUnicodeWriter *writer)
7923{
7924 const char *starts = s;
7925 const char *e;
7926 Py_ssize_t startinpos, endinpos;
7927 PyObject *errorHandler = NULL, *exc = NULL;
7928 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007929 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007930
7931 e = s + size;
7932
7933 while (s < e) {
7934 ch = *s;
7935
7936 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7937 key = PyLong_FromLong((long)ch);
7938 if (key == NULL)
7939 goto onError;
7940
7941 item = PyObject_GetItem(mapping, key);
7942 Py_DECREF(key);
7943 if (item == NULL) {
7944 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7945 /* No mapping found means: mapping is undefined. */
7946 PyErr_Clear();
7947 goto Undefined;
7948 } else
7949 goto onError;
7950 }
7951
7952 /* Apply mapping */
7953 if (item == Py_None)
7954 goto Undefined;
7955 if (PyLong_Check(item)) {
7956 long value = PyLong_AS_LONG(item);
7957 if (value == 0xFFFE)
7958 goto Undefined;
7959 if (value < 0 || value > MAX_UNICODE) {
7960 PyErr_Format(PyExc_TypeError,
7961 "character mapping must be in range(0x%lx)",
7962 (unsigned long)MAX_UNICODE + 1);
7963 goto onError;
7964 }
7965
7966 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7967 goto onError;
7968 }
7969 else if (PyUnicode_Check(item)) {
7970 if (PyUnicode_READY(item) == -1)
7971 goto onError;
7972 if (PyUnicode_GET_LENGTH(item) == 1) {
7973 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7974 if (value == 0xFFFE)
7975 goto Undefined;
7976 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7977 goto onError;
7978 }
7979 else {
7980 writer->overallocate = 1;
7981 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7982 goto onError;
7983 }
7984 }
7985 else {
7986 /* wrong return value */
7987 PyErr_SetString(PyExc_TypeError,
7988 "character mapping must return integer, None or str");
7989 goto onError;
7990 }
7991 Py_CLEAR(item);
7992 ++s;
7993 continue;
7994
7995Undefined:
7996 /* undefined mapping */
7997 Py_CLEAR(item);
7998 startinpos = s-starts;
7999 endinpos = startinpos+1;
8000 if (unicode_decode_call_errorhandler_writer(
8001 errors, &errorHandler,
8002 "charmap", "character maps to <undefined>",
8003 &starts, &e, &startinpos, &endinpos, &exc, &s,
8004 writer)) {
8005 goto onError;
8006 }
8007 }
8008 Py_XDECREF(errorHandler);
8009 Py_XDECREF(exc);
8010 return 0;
8011
8012onError:
8013 Py_XDECREF(item);
8014 Py_XDECREF(errorHandler);
8015 Py_XDECREF(exc);
8016 return -1;
8017}
8018
Alexander Belopolsky40018472011-02-26 01:02:56 +00008019PyObject *
8020PyUnicode_DecodeCharmap(const char *s,
8021 Py_ssize_t size,
8022 PyObject *mapping,
8023 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008025 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008026
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 /* Default to Latin-1 */
8028 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008032 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008033 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008034 writer.min_length = size;
8035 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008037
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008038 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008039 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8040 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008041 }
8042 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008043 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8044 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008046 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008047
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008049 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 return NULL;
8051}
8052
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053/* Charmap encoding: the lookup table */
8054
Alexander Belopolsky40018472011-02-26 01:02:56 +00008055struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 PyObject_HEAD
8057 unsigned char level1[32];
8058 int count2, count3;
8059 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060};
8061
8062static PyObject*
8063encoding_map_size(PyObject *obj, PyObject* args)
8064{
8065 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008066 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068}
8069
8070static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 PyDoc_STR("Return the size (in bytes) of this object") },
8073 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008074};
8075
8076static void
8077encoding_map_dealloc(PyObject* o)
8078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080}
8081
8082static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 "EncodingMap", /*tp_name*/
8085 sizeof(struct encoding_map), /*tp_basicsize*/
8086 0, /*tp_itemsize*/
8087 /* methods */
8088 encoding_map_dealloc, /*tp_dealloc*/
8089 0, /*tp_print*/
8090 0, /*tp_getattr*/
8091 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008092 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 0, /*tp_repr*/
8094 0, /*tp_as_number*/
8095 0, /*tp_as_sequence*/
8096 0, /*tp_as_mapping*/
8097 0, /*tp_hash*/
8098 0, /*tp_call*/
8099 0, /*tp_str*/
8100 0, /*tp_getattro*/
8101 0, /*tp_setattro*/
8102 0, /*tp_as_buffer*/
8103 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8104 0, /*tp_doc*/
8105 0, /*tp_traverse*/
8106 0, /*tp_clear*/
8107 0, /*tp_richcompare*/
8108 0, /*tp_weaklistoffset*/
8109 0, /*tp_iter*/
8110 0, /*tp_iternext*/
8111 encoding_map_methods, /*tp_methods*/
8112 0, /*tp_members*/
8113 0, /*tp_getset*/
8114 0, /*tp_base*/
8115 0, /*tp_dict*/
8116 0, /*tp_descr_get*/
8117 0, /*tp_descr_set*/
8118 0, /*tp_dictoffset*/
8119 0, /*tp_init*/
8120 0, /*tp_alloc*/
8121 0, /*tp_new*/
8122 0, /*tp_free*/
8123 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124};
8125
8126PyObject*
8127PyUnicode_BuildEncodingMap(PyObject* string)
8128{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 PyObject *result;
8130 struct encoding_map *mresult;
8131 int i;
8132 int need_dict = 0;
8133 unsigned char level1[32];
8134 unsigned char level2[512];
8135 unsigned char *mlevel1, *mlevel2, *mlevel3;
8136 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 int kind;
8138 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008142 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 PyErr_BadArgument();
8144 return NULL;
8145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 kind = PyUnicode_KIND(string);
8147 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008148 length = PyUnicode_GET_LENGTH(string);
8149 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 memset(level1, 0xFF, sizeof level1);
8151 memset(level2, 0xFF, sizeof level2);
8152
8153 /* If there isn't a one-to-one mapping of NULL to \0,
8154 or if there are non-BMP characters, we need to use
8155 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008158 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 ch = PyUnicode_READ(kind, data, i);
8161 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 need_dict = 1;
8163 break;
8164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 /* unmapped character */
8167 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 l1 = ch >> 11;
8169 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 if (level1[l1] == 0xFF)
8171 level1[l1] = count2++;
8172 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 }
8175
8176 if (count2 >= 0xFF || count3 >= 0xFF)
8177 need_dict = 1;
8178
8179 if (need_dict) {
8180 PyObject *result = PyDict_New();
8181 PyObject *key, *value;
8182 if (!result)
8183 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008184 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008186 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008187 if (!key || !value)
8188 goto failed1;
8189 if (PyDict_SetItem(result, key, value) == -1)
8190 goto failed1;
8191 Py_DECREF(key);
8192 Py_DECREF(value);
8193 }
8194 return result;
8195 failed1:
8196 Py_XDECREF(key);
8197 Py_XDECREF(value);
8198 Py_DECREF(result);
8199 return NULL;
8200 }
8201
8202 /* Create a three-level trie */
8203 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8204 16*count2 + 128*count3 - 1);
8205 if (!result)
8206 return PyErr_NoMemory();
8207 PyObject_Init(result, &EncodingMapType);
8208 mresult = (struct encoding_map*)result;
8209 mresult->count2 = count2;
8210 mresult->count3 = count3;
8211 mlevel1 = mresult->level1;
8212 mlevel2 = mresult->level23;
8213 mlevel3 = mresult->level23 + 16*count2;
8214 memcpy(mlevel1, level1, 32);
8215 memset(mlevel2, 0xFF, 16*count2);
8216 memset(mlevel3, 0, 128*count3);
8217 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008218 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008219 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8221 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222 /* unmapped character */
8223 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008224 o1 = ch>>11;
8225 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226 i2 = 16*mlevel1[o1] + o2;
8227 if (mlevel2[i2] == 0xFF)
8228 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008229 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008230 i3 = 128*mlevel2[i2] + o3;
8231 mlevel3[i3] = i;
8232 }
8233 return result;
8234}
8235
8236static int
Victor Stinner22168992011-11-20 17:09:18 +01008237encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238{
8239 struct encoding_map *map = (struct encoding_map*)mapping;
8240 int l1 = c>>11;
8241 int l2 = (c>>7) & 0xF;
8242 int l3 = c & 0x7F;
8243 int i;
8244
Victor Stinner22168992011-11-20 17:09:18 +01008245 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008247 if (c == 0)
8248 return 0;
8249 /* level 1*/
8250 i = map->level1[l1];
8251 if (i == 0xFF) {
8252 return -1;
8253 }
8254 /* level 2*/
8255 i = map->level23[16*i+l2];
8256 if (i == 0xFF) {
8257 return -1;
8258 }
8259 /* level 3 */
8260 i = map->level23[16*map->count2 + 128*i + l3];
8261 if (i == 0) {
8262 return -1;
8263 }
8264 return i;
8265}
8266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267/* Lookup the character ch in the mapping. If the character
8268 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008269 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008270static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008271charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272{
Christian Heimes217cfd12007-12-02 14:31:20 +00008273 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 PyObject *x;
8275
8276 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 x = PyObject_GetItem(mapping, w);
8279 Py_DECREF(w);
8280 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8282 /* No mapping found means: mapping is undefined. */
8283 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008284 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 } else
8286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008288 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008290 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 long value = PyLong_AS_LONG(x);
8292 if (value < 0 || value > 255) {
8293 PyErr_SetString(PyExc_TypeError,
8294 "character mapping must be in range(256)");
8295 Py_DECREF(x);
8296 return NULL;
8297 }
8298 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008300 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 /* wrong return value */
8304 PyErr_Format(PyExc_TypeError,
8305 "character mapping must return integer, bytes or None, not %.400s",
8306 x->ob_type->tp_name);
8307 Py_DECREF(x);
8308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 }
8310}
8311
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008313charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008315 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8316 /* exponentially overallocate to minimize reallocations */
8317 if (requiredsize < 2*outsize)
8318 requiredsize = 2*outsize;
8319 if (_PyBytes_Resize(outobj, requiredsize))
8320 return -1;
8321 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322}
8323
/* Outcome of charmapencode_output() for a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded into the output buffer */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008328 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 space is available. Return a new reference to the object that
8330 was put in the output buffer, or Py_None, if the mapping was undefined
8331 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008332 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008334charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008335 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 PyObject *rep;
8338 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008339 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340
Christian Heimes90aa7642007-12-19 02:45:37 +00008341 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 if (res == -1)
8345 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 if (outsize<requiredsize)
8347 if (charmapencode_resize(outobj, outpos, requiredsize))
8348 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008349 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 outstart[(*outpos)++] = (char)res;
8351 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 }
8353
8354 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 Py_DECREF(rep);
8359 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008360 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 if (PyLong_Check(rep)) {
8362 Py_ssize_t requiredsize = *outpos+1;
8363 if (outsize<requiredsize)
8364 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8365 Py_DECREF(rep);
8366 return enc_EXCEPTION;
8367 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008368 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 else {
8372 const char *repchars = PyBytes_AS_STRING(rep);
8373 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8374 Py_ssize_t requiredsize = *outpos+repsize;
8375 if (outsize<requiredsize)
8376 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8377 Py_DECREF(rep);
8378 return enc_EXCEPTION;
8379 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008380 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 memcpy(outstart + *outpos, repchars, repsize);
8382 *outpos += repsize;
8383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385 Py_DECREF(rep);
8386 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387}
8388
8389/* handle an error in PyUnicode_EncodeCharmap
8390 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008391static int
8392charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008393 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008395 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008396 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397{
8398 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008399 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008400 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008401 enum PyUnicode_Kind kind;
8402 void *data;
8403 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008405 Py_ssize_t collstartpos = *inpos;
8406 Py_ssize_t collendpos = *inpos+1;
8407 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008408 const char *encoding = "charmap";
8409 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008412 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413
Benjamin Petersonbac79492012-01-14 13:34:47 -05008414 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008415 return -1;
8416 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 /* find all unencodable characters */
8418 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008419 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008420 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008421 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008422 val = encoding_map_lookup(ch, mapping);
8423 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 break;
8425 ++collendpos;
8426 continue;
8427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008429 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8430 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 if (rep==NULL)
8432 return -1;
8433 else if (rep!=Py_None) {
8434 Py_DECREF(rep);
8435 break;
8436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 }
8440 /* cache callback name lookup
8441 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008442 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008443 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008444
8445 switch (*error_handler) {
8446 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008447 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008449
8450 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 x = charmapencode_output('?', mapping, res, respos);
8453 if (x==enc_EXCEPTION) {
8454 return -1;
8455 }
8456 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008457 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
8459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 }
8461 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008462 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 *inpos = collendpos;
8464 break;
Victor Stinner50149202015-09-22 00:26:54 +02008465
8466 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008467 /* generate replacement (temporarily (mis)uses p) */
8468 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 char buffer[2+29+1+1];
8470 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008471 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 for (cp = buffer; *cp; ++cp) {
8473 x = charmapencode_output(*cp, mapping, res, respos);
8474 if (x==enc_EXCEPTION)
8475 return -1;
8476 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008477 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return -1;
8479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 }
8481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 *inpos = collendpos;
8483 break;
Victor Stinner50149202015-09-22 00:26:54 +02008484
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 default:
Victor Stinner50149202015-09-22 00:26:54 +02008486 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008487 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008491 if (PyBytes_Check(repunicode)) {
8492 /* Directly copy bytes result to output. */
8493 Py_ssize_t outsize = PyBytes_Size(*res);
8494 Py_ssize_t requiredsize;
8495 repsize = PyBytes_Size(repunicode);
8496 requiredsize = *respos + repsize;
8497 if (requiredsize > outsize)
8498 /* Make room for all additional bytes. */
8499 if (charmapencode_resize(res, respos, requiredsize)) {
8500 Py_DECREF(repunicode);
8501 return -1;
8502 }
8503 memcpy(PyBytes_AsString(*res) + *respos,
8504 PyBytes_AsString(repunicode), repsize);
8505 *respos += repsize;
8506 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008507 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008508 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008511 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 Py_DECREF(repunicode);
8513 return -1;
8514 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008515 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008516 data = PyUnicode_DATA(repunicode);
8517 kind = PyUnicode_KIND(repunicode);
8518 for (index = 0; index < repsize; index++) {
8519 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8520 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008522 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return -1;
8524 }
8525 else if (x==enc_FAILED) {
8526 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008527 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 return -1;
8529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 }
8531 *inpos = newpos;
8532 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 }
8534 return 0;
8535}
8536
Alexander Belopolsky40018472011-02-26 01:02:56 +00008537PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538_PyUnicode_EncodeCharmap(PyObject *unicode,
8539 PyObject *mapping,
8540 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 /* output object */
8543 PyObject *res = NULL;
8544 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008545 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008548 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008549 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008551 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008552 void *data;
8553 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
Benjamin Petersonbac79492012-01-14 13:34:47 -05008555 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556 return NULL;
8557 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008558 data = PyUnicode_DATA(unicode);
8559 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 /* Default to Latin-1 */
8562 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008563 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 /* allocate enough for a simple encoding without
8566 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008567 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 if (res == NULL)
8569 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008570 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008574 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 if (x==enc_EXCEPTION) /* error */
8578 goto onError;
8579 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008582 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 &res, &respos)) {
8584 goto onError;
8585 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 else
8588 /* done with this character => adjust input position */
8589 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008593 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008594 if (_PyBytes_Resize(&res, respos) < 0)
8595 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008598 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 return res;
8600
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 Py_XDECREF(res);
8603 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008604 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 return NULL;
8606}
8607
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608/* Deprecated */
8609PyObject *
8610PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8611 Py_ssize_t size,
8612 PyObject *mapping,
8613 const char *errors)
8614{
8615 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008616 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008617 if (unicode == NULL)
8618 return NULL;
8619 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8620 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008621 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008622}
8623
Alexander Belopolsky40018472011-02-26 01:02:56 +00008624PyObject *
8625PyUnicode_AsCharmapString(PyObject *unicode,
8626 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627{
8628 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 PyErr_BadArgument();
8630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008632 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633}
8634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636static void
8637make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639 Py_ssize_t startpos, Py_ssize_t endpos,
8640 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 *exceptionObject = _PyUnicodeTranslateError_Create(
8644 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 }
8646 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8648 goto onError;
8649 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8650 goto onError;
8651 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8652 goto onError;
8653 return;
8654 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008655 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 }
8657}
8658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659/* error handling callback helper:
8660 build arguments, call the callback and check the arguments,
8661 put the result into newpos and return the replacement string, which
8662 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663static PyObject *
8664unicode_translate_call_errorhandler(const char *errors,
8665 PyObject **errorHandler,
8666 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008668 Py_ssize_t startpos, Py_ssize_t endpos,
8669 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008671 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008673 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 PyObject *restuple;
8675 PyObject *resunicode;
8676
8677 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 }
8682
8683 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008688 restuple = PyObject_CallFunctionObjArgs(
8689 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008693 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 Py_DECREF(restuple);
8695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008697 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 &resunicode, &i_newpos)) {
8699 Py_DECREF(restuple);
8700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008702 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008704 else
8705 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008707 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 Py_DECREF(restuple);
8709 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 Py_INCREF(resunicode);
8712 Py_DECREF(restuple);
8713 return resunicode;
8714}
8715
8716/* Lookup the character ch in the mapping and put the result in result,
8717 which must be decrefed by the caller.
8718 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008719static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721{
Christian Heimes217cfd12007-12-02 14:31:20 +00008722 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 PyObject *x;
8724
8725 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 x = PyObject_GetItem(mapping, w);
8728 Py_DECREF(w);
8729 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8731 /* No mapping found means: use 1:1 mapping. */
8732 PyErr_Clear();
8733 *result = NULL;
8734 return 0;
8735 } else
8736 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 }
8738 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 *result = x;
8740 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008742 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008744 if (value < 0 || value > MAX_UNICODE) {
8745 PyErr_Format(PyExc_ValueError,
8746 "character mapping must be in range(0x%x)",
8747 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 Py_DECREF(x);
8749 return -1;
8750 }
8751 *result = x;
8752 return 0;
8753 }
8754 else if (PyUnicode_Check(x)) {
8755 *result = x;
8756 return 0;
8757 }
8758 else {
8759 /* wrong return value */
8760 PyErr_SetString(PyExc_TypeError,
8761 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008762 Py_DECREF(x);
8763 return -1;
8764 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765}
Victor Stinner1194ea02014-04-04 19:37:40 +02008766
8767/* lookup the character, write the result into the writer.
8768 Return 1 if the result was written into the writer, return 0 if the mapping
8769 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008771charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8772 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773{
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 PyObject *item;
8775
8776 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008778
8779 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008781 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008784 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008786
8787 if (item == Py_None) {
8788 Py_DECREF(item);
8789 return 0;
8790 }
8791
8792 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008793 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8794 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8795 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008796 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8797 Py_DECREF(item);
8798 return -1;
8799 }
8800 Py_DECREF(item);
8801 return 1;
8802 }
8803
8804 if (!PyUnicode_Check(item)) {
8805 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008807 }
8808
8809 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8810 Py_DECREF(item);
8811 return -1;
8812 }
8813
8814 Py_DECREF(item);
8815 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816}
8817
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818static int
8819unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8820 Py_UCS1 *translate)
8821{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008822 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 int ret = 0;
8824
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 if (charmaptranslate_lookup(ch, mapping, &item)) {
8826 return -1;
8827 }
8828
8829 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008830 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008831 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008832 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008833 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834 /* not found => default to 1:1 mapping */
8835 translate[ch] = ch;
8836 return 1;
8837 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008838 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008839 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008840 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8841 used it */
8842 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843 /* invalid character or character outside ASCII:
8844 skip the fast translate */
8845 goto exit;
8846 }
8847 translate[ch] = (Py_UCS1)replace;
8848 }
8849 else if (PyUnicode_Check(item)) {
8850 Py_UCS4 replace;
8851
8852 if (PyUnicode_READY(item) == -1) {
8853 Py_DECREF(item);
8854 return -1;
8855 }
8856 if (PyUnicode_GET_LENGTH(item) != 1)
8857 goto exit;
8858
8859 replace = PyUnicode_READ_CHAR(item, 0);
8860 if (replace > 127)
8861 goto exit;
8862 translate[ch] = (Py_UCS1)replace;
8863 }
8864 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008865 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 goto exit;
8867 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 ret = 1;
8869
Benjamin Peterson1365de72014-04-07 20:15:41 -04008870 exit:
8871 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008872 return ret;
8873}
8874
8875/* Fast path for ascii => ascii translation. Return 1 if the whole string
8876 was translated into writer, return 0 if the input string was partially
8877 translated into writer, raise an exception and return -1 on error. */
8878static int
8879unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008880 _PyUnicodeWriter *writer, int ignore,
8881 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882{
Victor Stinner872b2912014-04-05 14:27:07 +02008883 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008884 Py_ssize_t len;
8885 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008886 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008887
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 len = PyUnicode_GET_LENGTH(input);
8889
Victor Stinner872b2912014-04-05 14:27:07 +02008890 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891
8892 in = PyUnicode_1BYTE_DATA(input);
8893 end = in + len;
8894
8895 assert(PyUnicode_IS_ASCII(writer->buffer));
8896 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8897 out = PyUnicode_1BYTE_DATA(writer->buffer);
8898
Victor Stinner872b2912014-04-05 14:27:07 +02008899 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008901 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008903 int translate = unicode_fast_translate_lookup(mapping, ch,
8904 ascii_table);
8905 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008907 if (translate == 0)
8908 goto exit;
8909 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 }
Victor Stinner872b2912014-04-05 14:27:07 +02008911 if (ch2 == 0xfe) {
8912 if (ignore)
8913 continue;
8914 goto exit;
8915 }
8916 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008918 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919 }
Victor Stinner872b2912014-04-05 14:27:07 +02008920 res = 1;
8921
8922exit:
8923 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008924 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008925 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926}
8927
Victor Stinner3222da22015-10-01 22:07:32 +02008928static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929_PyUnicode_TranslateCharmap(PyObject *input,
8930 PyObject *mapping,
8931 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008934 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 Py_ssize_t size, i;
8936 int kind;
8937 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 _PyUnicodeWriter writer;
8939 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008940 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941 PyObject *errorHandler = NULL;
8942 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008943 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008944 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 PyErr_BadArgument();
8948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 if (PyUnicode_READY(input) == -1)
8952 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 kind = PyUnicode_KIND(input);
8955 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008957 if (size == 0)
8958 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008960 /* allocate enough for a simple 1:1 translation without
8961 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008962 _PyUnicodeWriter_Init(&writer);
8963 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
Victor Stinner872b2912014-04-05 14:27:07 +02008966 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8967
Victor Stinner33798672016-03-01 21:59:58 +01008968 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008970 if (PyUnicode_IS_ASCII(input)) {
8971 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8972 if (res < 0) {
8973 _PyUnicodeWriter_Dealloc(&writer);
8974 return NULL;
8975 }
8976 if (res == 1)
8977 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978 }
Victor Stinner33798672016-03-01 21:59:58 +01008979 else {
8980 i = 0;
8981 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008985 int translate;
8986 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8987 Py_ssize_t newpos;
8988 /* startpos for collecting untranslatable chars */
8989 Py_ssize_t collstart;
8990 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Victor Stinner1194ea02014-04-04 19:37:40 +02008993 ch = PyUnicode_READ(kind, data, i);
8994 translate = charmaptranslate_output(ch, mapping, &writer);
8995 if (translate < 0)
8996 goto onError;
8997
8998 if (translate != 0) {
8999 /* it worked => adjust input pointer */
9000 ++i;
9001 continue;
9002 }
9003
9004 /* untranslatable character */
9005 collstart = i;
9006 collend = i+1;
9007
9008 /* find all untranslatable characters */
9009 while (collend < size) {
9010 PyObject *x;
9011 ch = PyUnicode_READ(kind, data, collend);
9012 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009013 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009014 Py_XDECREF(x);
9015 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 ++collend;
9018 }
9019
9020 if (ignore) {
9021 i = collend;
9022 }
9023 else {
9024 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9025 reason, input, &exc,
9026 collstart, collend, &newpos);
9027 if (repunicode == NULL)
9028 goto onError;
9029 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009030 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009031 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009032 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009033 Py_DECREF(repunicode);
9034 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009035 }
9036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009037 Py_XDECREF(exc);
9038 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009039 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009042 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009043 Py_XDECREF(exc);
9044 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 return NULL;
9046}
9047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048/* Deprecated. Use PyUnicode_Translate instead. */
9049PyObject *
9050PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9051 Py_ssize_t size,
9052 PyObject *mapping,
9053 const char *errors)
9054{
Christian Heimes5f520f42012-09-11 14:03:25 +02009055 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009056 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 if (!unicode)
9058 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009059 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9060 Py_DECREF(unicode);
9061 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062}
9063
Alexander Belopolsky40018472011-02-26 01:02:56 +00009064PyObject *
9065PyUnicode_Translate(PyObject *str,
9066 PyObject *mapping,
9067 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009069 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009070 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009071 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
Tim Petersced69f82003-09-16 20:30:58 +00009073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074PyObject *
9075_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9076{
9077 if (!PyUnicode_Check(unicode)) {
9078 PyErr_BadInternalCall();
9079 return NULL;
9080 }
9081 if (PyUnicode_READY(unicode) == -1)
9082 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009083 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 /* If the string is already ASCII, just return the same string */
9085 Py_INCREF(unicode);
9086 return unicode;
9087 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009088
9089 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9090 PyObject *result = PyUnicode_New(len, 127);
9091 if (result == NULL) {
9092 return NULL;
9093 }
9094
9095 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9096 int kind = PyUnicode_KIND(unicode);
9097 const void *data = PyUnicode_DATA(unicode);
9098 Py_ssize_t i;
9099 for (i = 0; i < len; ++i) {
9100 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9101 if (ch < 127) {
9102 out[i] = ch;
9103 }
9104 else if (Py_UNICODE_ISSPACE(ch)) {
9105 out[i] = ' ';
9106 }
9107 else {
9108 int decimal = Py_UNICODE_TODECIMAL(ch);
9109 if (decimal < 0) {
9110 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009111 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009112 _PyUnicode_LENGTH(result) = i + 1;
9113 break;
9114 }
9115 out[i] = '0' + decimal;
9116 }
9117 }
9118
INADA Naoki16dfca42018-07-14 12:06:43 +09009119 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009120 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121}
9122
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123PyObject *
9124PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9125 Py_ssize_t length)
9126{
Victor Stinnerf0124502011-11-21 23:12:56 +01009127 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009128 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009129 Py_UCS4 maxchar;
9130 enum PyUnicode_Kind kind;
9131 void *data;
9132
Victor Stinner99d7ad02012-02-22 13:37:39 +01009133 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009134 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009135 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 if (ch > 127) {
9137 int decimal = Py_UNICODE_TODECIMAL(ch);
9138 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009139 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009140 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009141 }
9142 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009143
9144 /* Copy to a new string */
9145 decimal = PyUnicode_New(length, maxchar);
9146 if (decimal == NULL)
9147 return decimal;
9148 kind = PyUnicode_KIND(decimal);
9149 data = PyUnicode_DATA(decimal);
9150 /* Iterate over code points */
9151 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009152 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009153 if (ch > 127) {
9154 int decimal = Py_UNICODE_TODECIMAL(ch);
9155 if (decimal >= 0)
9156 ch = '0' + decimal;
9157 }
9158 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009160 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009161}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009162/* --- Decimal Encoder ---------------------------------------------------- */
9163
Alexander Belopolsky40018472011-02-26 01:02:56 +00009164int
9165PyUnicode_EncodeDecimal(Py_UNICODE *s,
9166 Py_ssize_t length,
9167 char *output,
9168 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009170 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009171 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009172 enum PyUnicode_Kind kind;
9173 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009174
9175 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 PyErr_BadArgument();
9177 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009178 }
9179
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009180 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009181 if (unicode == NULL)
9182 return -1;
9183
Victor Stinner42bf7752011-11-21 22:52:58 +01009184 kind = PyUnicode_KIND(unicode);
9185 data = PyUnicode_DATA(unicode);
9186
Victor Stinnerb84d7232011-11-22 01:50:07 +01009187 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009188 PyObject *exc;
9189 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009191 Py_ssize_t startpos;
9192
9193 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009194
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009197 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009199 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 decimal = Py_UNICODE_TODECIMAL(ch);
9201 if (decimal >= 0) {
9202 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
9206 if (0 < ch && ch < 256) {
9207 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009208 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009209 continue;
9210 }
Victor Stinner6345be92011-11-25 20:09:01 +01009211
Victor Stinner42bf7752011-11-21 22:52:58 +01009212 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009213 exc = NULL;
9214 raise_encode_exception(&exc, "decimal", unicode,
9215 startpos, startpos+1,
9216 "invalid decimal Unicode string");
9217 Py_XDECREF(exc);
9218 Py_DECREF(unicode);
9219 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220 }
9221 /* 0-terminate the output string */
9222 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009223 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009224 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009225}
9226
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227/* --- Helpers ------------------------------------------------------------ */
9228
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length.  An `end` beyond `len` is clipped to `len`; a negative `end` or
   `start` is taken relative to `len` and clipped to 0 if still negative.
   `start` is NOT clipped to `len`.

   Wrapped in do { } while (0) so the macro behaves as one statement (safe
   under an unbraced if/else); every use must end with a semicolon.
   Arguments are evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009245any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009247 Py_ssize_t end,
9248 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009250 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 void *buf1, *buf2;
9252 Py_ssize_t len1, len2, result;
9253
9254 kind1 = PyUnicode_KIND(s1);
9255 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 if (kind1 < kind2)
9257 return -1;
9258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 len1 = PyUnicode_GET_LENGTH(s1);
9260 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009261 ADJUST_INDICES(start, end, len1);
9262 if (end - start < len2)
9263 return -1;
9264
9265 buf1 = PyUnicode_DATA(s1);
9266 buf2 = PyUnicode_DATA(s2);
9267 if (len2 == 1) {
9268 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9269 result = findchar((const char *)buf1 + kind1*start,
9270 kind1, end - start, ch, direction);
9271 if (result == -1)
9272 return -1;
9273 else
9274 return start + result;
9275 }
9276
9277 if (kind2 != kind1) {
9278 buf2 = _PyUnicode_AsKind(s2, kind1);
9279 if (!buf2)
9280 return -2;
9281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282
Victor Stinner794d5672011-10-10 03:21:36 +02009283 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009284 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009285 case PyUnicode_1BYTE_KIND:
9286 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9287 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9288 else
9289 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9290 break;
9291 case PyUnicode_2BYTE_KIND:
9292 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9293 break;
9294 case PyUnicode_4BYTE_KIND:
9295 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9296 break;
9297 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009298 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009299 }
9300 }
9301 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009302 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009303 case PyUnicode_1BYTE_KIND:
9304 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9305 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 else
9307 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9308 break;
9309 case PyUnicode_2BYTE_KIND:
9310 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9311 break;
9312 case PyUnicode_4BYTE_KIND:
9313 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9314 break;
9315 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009316 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 }
9319
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009320 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 PyMem_Free(buf2);
9322
9323 return result;
9324}
9325
9326Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009327_PyUnicode_InsertThousandsGrouping(
9328 PyObject *unicode, Py_ssize_t index,
9329 Py_ssize_t n_buffer,
9330 void *digits, Py_ssize_t n_digits,
9331 Py_ssize_t min_width,
9332 const char *grouping, PyObject *thousands_sep,
9333 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334{
Victor Stinner41a863c2012-02-24 00:37:51 +01009335 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009336 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009337 Py_ssize_t thousands_sep_len;
9338 Py_ssize_t len;
9339
9340 if (unicode != NULL) {
9341 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009342 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009343 }
9344 else {
9345 kind = PyUnicode_1BYTE_KIND;
9346 data = NULL;
9347 }
9348 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9349 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9350 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9351 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009352 if (thousands_sep_kind < kind) {
9353 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9354 if (!thousands_sep_data)
9355 return -1;
9356 }
9357 else {
9358 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9359 if (!data)
9360 return -1;
9361 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 }
9363
Benjamin Petersonead6b532011-12-20 17:23:42 -06009364 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009368 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009371 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009373 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009375 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009379 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009381 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009384 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009385 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009387 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009388 break;
9389 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009390 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009392 if (unicode != NULL && thousands_sep_kind != kind) {
9393 if (thousands_sep_kind < kind)
9394 PyMem_Free(thousands_sep_data);
9395 else
9396 PyMem_Free(data);
9397 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 if (unicode == NULL) {
9399 *maxchar = 127;
9400 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009401 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009402 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009403 }
9404 }
9405 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406}
9407
9408
Alexander Belopolsky40018472011-02-26 01:02:56 +00009409Py_ssize_t
9410PyUnicode_Count(PyObject *str,
9411 PyObject *substr,
9412 Py_ssize_t start,
9413 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009415 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009416 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 void *buf1 = NULL, *buf2 = NULL;
9418 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009420 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 kind1 = PyUnicode_KIND(str);
9424 kind2 = PyUnicode_KIND(substr);
9425 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009428 len1 = PyUnicode_GET_LENGTH(str);
9429 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009432 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 buf1 = PyUnicode_DATA(str);
9435 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009436 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009437 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009438 if (!buf2)
9439 goto onError;
9440 }
9441
9442 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009444 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009445 result = asciilib_count(
9446 ((Py_UCS1*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
9449 else
9450 result = ucs1lib_count(
9451 ((Py_UCS1*)buf1) + start, end - start,
9452 buf2, len2, PY_SSIZE_T_MAX
9453 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 break;
9455 case PyUnicode_2BYTE_KIND:
9456 result = ucs2lib_count(
9457 ((Py_UCS2*)buf1) + start, end - start,
9458 buf2, len2, PY_SSIZE_T_MAX
9459 );
9460 break;
9461 case PyUnicode_4BYTE_KIND:
9462 result = ucs4lib_count(
9463 ((Py_UCS4*)buf1) + start, end - start,
9464 buf2, len2, PY_SSIZE_T_MAX
9465 );
9466 break;
9467 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009468 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009470
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 PyMem_Free(buf2);
9473
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009476 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 PyMem_Free(buf2);
9478 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Alexander Belopolsky40018472011-02-26 01:02:56 +00009481Py_ssize_t
9482PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009483 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009484 Py_ssize_t start,
9485 Py_ssize_t end,
9486 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009488 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009490
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009491 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492}
9493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494Py_ssize_t
9495PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9496 Py_ssize_t start, Py_ssize_t end,
9497 int direction)
9498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009500 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 if (PyUnicode_READY(str) == -1)
9502 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009503 len = PyUnicode_GET_LENGTH(str);
9504 ADJUST_INDICES(start, end, len);
9505 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009506 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009508 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9509 kind, end-start, ch, direction);
9510 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009512 else
9513 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514}
9515
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009517tailmatch(PyObject *self,
9518 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009519 Py_ssize_t start,
9520 Py_ssize_t end,
9521 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 int kind_self;
9524 int kind_sub;
9525 void *data_self;
9526 void *data_sub;
9527 Py_ssize_t offset;
9528 Py_ssize_t i;
9529 Py_ssize_t end_sub;
9530
9531 if (PyUnicode_READY(self) == -1 ||
9532 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009533 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9536 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009540 if (PyUnicode_GET_LENGTH(substring) == 0)
9541 return 1;
9542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 kind_self = PyUnicode_KIND(self);
9544 data_self = PyUnicode_DATA(self);
9545 kind_sub = PyUnicode_KIND(substring);
9546 data_sub = PyUnicode_DATA(substring);
9547 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9548
9549 if (direction > 0)
9550 offset = end;
9551 else
9552 offset = start;
9553
9554 if (PyUnicode_READ(kind_self, data_self, offset) ==
9555 PyUnicode_READ(kind_sub, data_sub, 0) &&
9556 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9557 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9558 /* If both are of the same kind, memcmp is sufficient */
9559 if (kind_self == kind_sub) {
9560 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009561 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 data_sub,
9563 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009564 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009566 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 else {
9568 /* We do not need to compare 0 and len(substring)-1 because
9569 the if statement above ensured already that they are equal
9570 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 for (i = 1; i < end_sub; ++i) {
9572 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9573 PyUnicode_READ(kind_sub, data_sub, i))
9574 return 0;
9575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 }
9579
9580 return 0;
9581}
9582
Alexander Belopolsky40018472011-02-26 01:02:56 +00009583Py_ssize_t
9584PyUnicode_Tailmatch(PyObject *str,
9585 PyObject *substr,
9586 Py_ssize_t start,
9587 Py_ssize_t end,
9588 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594}
9595
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009596static PyObject *
9597ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009599 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9600 char *resdata, *data = PyUnicode_DATA(self);
9601 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009602
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009603 res = PyUnicode_New(len, 127);
9604 if (res == NULL)
9605 return NULL;
9606 resdata = PyUnicode_DATA(res);
9607 if (lower)
9608 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009610 _Py_bytes_upper(resdata, data, len);
9611 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612}
9613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009615handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009617 Py_ssize_t j;
9618 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009619 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009620 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009621
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009622 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9623
9624 where ! is a negation and \p{xxx} is a character with property xxx.
9625 */
9626 for (j = i - 1; j >= 0; j--) {
9627 c = PyUnicode_READ(kind, data, j);
9628 if (!_PyUnicode_IsCaseIgnorable(c))
9629 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9632 if (final_sigma) {
9633 for (j = i + 1; j < length; j++) {
9634 c = PyUnicode_READ(kind, data, j);
9635 if (!_PyUnicode_IsCaseIgnorable(c))
9636 break;
9637 }
9638 final_sigma = j == length || !_PyUnicode_IsCased(c);
9639 }
9640 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641}
9642
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643static int
9644lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9645 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009647 /* Obscure special case. */
9648 if (c == 0x3A3) {
9649 mapped[0] = handle_capital_sigma(kind, data, length, i);
9650 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655static Py_ssize_t
9656do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t i, k = 0;
9659 int n_res, j;
9660 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009661
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 c = PyUnicode_READ(kind, data, 0);
9663 n_res = _PyUnicode_ToUpperFull(c, mapped);
9664 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009665 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 for (i = 1; i < length; i++) {
9669 c = PyUnicode_READ(kind, data, i);
9670 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9671 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009672 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009674 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009675 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679static Py_ssize_t
9680do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9681 Py_ssize_t i, k = 0;
9682
9683 for (i = 0; i < length; i++) {
9684 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9685 int n_res, j;
9686 if (Py_UNICODE_ISUPPER(c)) {
9687 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9688 }
9689 else if (Py_UNICODE_ISLOWER(c)) {
9690 n_res = _PyUnicode_ToUpperFull(c, mapped);
9691 }
9692 else {
9693 n_res = 1;
9694 mapped[0] = c;
9695 }
9696 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009697 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698 res[k++] = mapped[j];
9699 }
9700 }
9701 return k;
9702}
9703
9704static Py_ssize_t
9705do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9706 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708 Py_ssize_t i, k = 0;
9709
9710 for (i = 0; i < length; i++) {
9711 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9712 int n_res, j;
9713 if (lower)
9714 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9715 else
9716 n_res = _PyUnicode_ToUpperFull(c, mapped);
9717 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009718 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 res[k++] = mapped[j];
9720 }
9721 }
9722 return k;
9723}
9724
9725static Py_ssize_t
9726do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9727{
9728 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9729}
9730
9731static Py_ssize_t
9732do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9733{
9734 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9735}
9736
Benjamin Petersone51757f2012-01-12 21:10:29 -05009737static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009738do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9739{
9740 Py_ssize_t i, k = 0;
9741
9742 for (i = 0; i < length; i++) {
9743 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9744 Py_UCS4 mapped[3];
9745 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9746 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009747 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009748 res[k++] = mapped[j];
9749 }
9750 }
9751 return k;
9752}
9753
9754static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009755do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9756{
9757 Py_ssize_t i, k = 0;
9758 int previous_is_cased;
9759
9760 previous_is_cased = 0;
9761 for (i = 0; i < length; i++) {
9762 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9763 Py_UCS4 mapped[3];
9764 int n_res, j;
9765
9766 if (previous_is_cased)
9767 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768 else
9769 n_res = _PyUnicode_ToTitleFull(c, mapped);
9770
9771 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009772 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009773 res[k++] = mapped[j];
9774 }
9775
9776 previous_is_cased = _PyUnicode_IsCased(c);
9777 }
9778 return k;
9779}
9780
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009781static PyObject *
9782case_operation(PyObject *self,
9783 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9784{
9785 PyObject *res = NULL;
9786 Py_ssize_t length, newlength = 0;
9787 int kind, outkind;
9788 void *data, *outdata;
9789 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9790
Benjamin Petersoneea48462012-01-16 14:28:50 -05009791 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009792
9793 kind = PyUnicode_KIND(self);
9794 data = PyUnicode_DATA(self);
9795 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009796 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009797 PyErr_SetString(PyExc_OverflowError, "string is too long");
9798 return NULL;
9799 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009800 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009801 if (tmp == NULL)
9802 return PyErr_NoMemory();
9803 newlength = perform(kind, data, length, tmp, &maxchar);
9804 res = PyUnicode_New(newlength, maxchar);
9805 if (res == NULL)
9806 goto leave;
9807 tmpend = tmp + newlength;
9808 outdata = PyUnicode_DATA(res);
9809 outkind = PyUnicode_KIND(res);
9810 switch (outkind) {
9811 case PyUnicode_1BYTE_KIND:
9812 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9813 break;
9814 case PyUnicode_2BYTE_KIND:
9815 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9816 break;
9817 case PyUnicode_4BYTE_KIND:
9818 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9819 break;
9820 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009821 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 }
9823 leave:
9824 PyMem_FREE(tmp);
9825 return res;
9826}
9827
Tim Peters8ce9f162004-08-27 01:49:32 +00009828PyObject *
9829PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009831 PyObject *res;
9832 PyObject *fseq;
9833 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009834 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009836 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009837 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009838 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009839 }
9840
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009841 /* NOTE: the following code can't call back into Python code,
9842 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009843 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009844
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009845 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009846 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009847 res = _PyUnicode_JoinArray(separator, items, seqlen);
9848 Py_DECREF(fseq);
9849 return res;
9850}
9851
9852PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009853_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009854{
9855 PyObject *res = NULL; /* the result */
9856 PyObject *sep = NULL;
9857 Py_ssize_t seplen;
9858 PyObject *item;
9859 Py_ssize_t sz, i, res_offset;
9860 Py_UCS4 maxchar;
9861 Py_UCS4 item_maxchar;
9862 int use_memcpy;
9863 unsigned char *res_data = NULL, *sep_data = NULL;
9864 PyObject *last_obj;
9865 unsigned int kind = 0;
9866
Tim Peters05eba1f2004-08-27 21:32:02 +00009867 /* If empty sequence, return u"". */
9868 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009869 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009871
Tim Peters05eba1f2004-08-27 21:32:02 +00009872 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009873 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009874 if (seqlen == 1) {
9875 if (PyUnicode_CheckExact(items[0])) {
9876 res = items[0];
9877 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009878 return res;
9879 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009880 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009881 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009882 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009883 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009884 /* Set up sep and seplen */
9885 if (separator == NULL) {
9886 /* fall back to a blank space separator */
9887 sep = PyUnicode_FromOrdinal(' ');
9888 if (!sep)
9889 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009890 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009891 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009893 else {
9894 if (!PyUnicode_Check(separator)) {
9895 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02009896 "separator: expected str instance, %T found",
9897 separator);
Victor Stinneracf47b82011-10-06 12:32:37 +02009898 goto onError;
9899 }
9900 if (PyUnicode_READY(separator))
9901 goto onError;
9902 sep = separator;
9903 seplen = PyUnicode_GET_LENGTH(separator);
9904 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9905 /* inc refcount to keep this code path symmetric with the
9906 above case of a blank separator */
9907 Py_INCREF(sep);
9908 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009909 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009910 }
9911
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009912 /* There are at least two things to join, or else we have a subclass
9913 * of str in the sequence.
9914 * Do a pre-pass to figure out the total amount of space we'll
9915 * need (sz), and see whether all argument are strings.
9916 */
9917 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009918#ifdef Py_DEBUG
9919 use_memcpy = 0;
9920#else
9921 use_memcpy = 1;
9922#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009923 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009924 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009925 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009926 if (!PyUnicode_Check(item)) {
9927 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +02009928 "sequence item %zd: expected str instance, %T found",
9929 i, item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009930 goto onError;
9931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 if (PyUnicode_READY(item) == -1)
9933 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009934 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009936 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009937 if (i != 0) {
9938 add_sz += seplen;
9939 }
9940 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009943 goto onError;
9944 }
Xiang Zhangb0541f42017-01-10 10:52:00 +08009945 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +02009946 if (use_memcpy && last_obj != NULL) {
9947 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9948 use_memcpy = 0;
9949 }
9950 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009951 }
Tim Petersced69f82003-09-16 20:30:58 +00009952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009954 if (res == NULL)
9955 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009956
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009957 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009958#ifdef Py_DEBUG
9959 use_memcpy = 0;
9960#else
9961 if (use_memcpy) {
9962 res_data = PyUnicode_1BYTE_DATA(res);
9963 kind = PyUnicode_KIND(res);
9964 if (seplen != 0)
9965 sep_data = PyUnicode_1BYTE_DATA(sep);
9966 }
9967#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009968 if (use_memcpy) {
9969 for (i = 0; i < seqlen; ++i) {
9970 Py_ssize_t itemlen;
9971 item = items[i];
9972
9973 /* Copy item, and maybe the separator. */
9974 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009975 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009976 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009977 kind * seplen);
9978 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009979 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009980
9981 itemlen = PyUnicode_GET_LENGTH(item);
9982 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +02009983 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +02009984 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009985 kind * itemlen);
9986 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009987 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009988 }
9989 assert(res_data == PyUnicode_1BYTE_DATA(res)
9990 + kind * PyUnicode_GET_LENGTH(res));
9991 }
9992 else {
9993 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9994 Py_ssize_t itemlen;
9995 item = items[i];
9996
9997 /* Copy item, and maybe the separator. */
9998 if (i && seplen != 0) {
9999 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10000 res_offset += seplen;
10001 }
10002
10003 itemlen = PyUnicode_GET_LENGTH(item);
10004 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010005 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010006 res_offset += itemlen;
10007 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010008 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010009 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010010 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010013 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
Benjamin Peterson29060642009-01-31 22:14:21 +000010016 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010018 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019 return NULL;
10020}
10021
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, starting at character index `start`.  `kind` selects the
   element width (1, 2 or 4 byte kind); the wchar kind is not supported.

   Every macro argument is parenthesized at each use so that compound
   expressions can be passed safely.  Note that `kind`, `value` and
   `length` are still evaluated more than once, so arguments must not have
   side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10045
Victor Stinnerd3f08822012-05-29 12:57:52 +020010046void
10047_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10048 Py_UCS4 fill_char)
10049{
10050 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10051 const void *data = PyUnicode_DATA(unicode);
10052 assert(PyUnicode_IS_READY(unicode));
10053 assert(unicode_modifiable(unicode));
10054 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10055 assert(start >= 0);
10056 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10057 FILL(kind, data, fill_char, start, length);
10058}
10059
Victor Stinner3fe55312012-01-04 00:33:50 +010010060Py_ssize_t
10061PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10062 Py_UCS4 fill_char)
10063{
10064 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010065
10066 if (!PyUnicode_Check(unicode)) {
10067 PyErr_BadInternalCall();
10068 return -1;
10069 }
10070 if (PyUnicode_READY(unicode) == -1)
10071 return -1;
10072 if (unicode_check_modifiable(unicode))
10073 return -1;
10074
Victor Stinnerd3f08822012-05-29 12:57:52 +020010075 if (start < 0) {
10076 PyErr_SetString(PyExc_IndexError, "string index out of range");
10077 return -1;
10078 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010079 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10080 PyErr_SetString(PyExc_ValueError,
10081 "fill character is bigger than "
10082 "the string maximum character");
10083 return -1;
10084 }
10085
10086 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10087 length = Py_MIN(maxlen, length);
10088 if (length <= 0)
10089 return 0;
10090
Victor Stinnerd3f08822012-05-29 12:57:52 +020010091 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010092 return length;
10093}
10094
Victor Stinner9310abb2011-10-05 00:59:23 +020010095static PyObject *
10096pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010097 Py_ssize_t left,
10098 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 PyObject *u;
10102 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010103 int kind;
10104 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105
10106 if (left < 0)
10107 left = 0;
10108 if (right < 0)
10109 right = 0;
10110
Victor Stinnerc4b49542011-12-11 22:44:26 +010010111 if (left == 0 && right == 0)
10112 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10115 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010116 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10117 return NULL;
10118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010120 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010122 if (!u)
10123 return NULL;
10124
10125 kind = PyUnicode_KIND(u);
10126 data = PyUnicode_DATA(u);
10127 if (left)
10128 FILL(kind, data, fill, 0, left);
10129 if (right)
10130 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010131 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010132 assert(_PyUnicode_CheckConsistency(u, 1));
10133 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134}
10135
Alexander Belopolsky40018472011-02-26 01:02:56 +000010136PyObject *
10137PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010141 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143
Benjamin Petersonead6b532011-12-20 17:23:42 -060010144 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010146 if (PyUnicode_IS_ASCII(string))
10147 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010148 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010149 PyUnicode_GET_LENGTH(string), keepends);
10150 else
10151 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010152 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010153 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 break;
10155 case PyUnicode_2BYTE_KIND:
10156 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010157 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 PyUnicode_GET_LENGTH(string), keepends);
10159 break;
10160 case PyUnicode_4BYTE_KIND:
10161 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010162 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 PyUnicode_GET_LENGTH(string), keepends);
10164 break;
10165 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010166 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169}
10170
Alexander Belopolsky40018472011-02-26 01:02:56 +000010171static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010172split(PyObject *self,
10173 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010174 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010176 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 void *buf1, *buf2;
10178 Py_ssize_t len1, len2;
10179 PyObject* out;
10180
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010182 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (PyUnicode_READY(self) == -1)
10185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010188 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 if (PyUnicode_IS_ASCII(self))
10191 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 PyUnicode_GET_LENGTH(self), maxcount
10194 );
10195 else
10196 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010198 PyUnicode_GET_LENGTH(self), maxcount
10199 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 case PyUnicode_2BYTE_KIND:
10201 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyUnicode_GET_LENGTH(self), maxcount
10204 );
10205 case PyUnicode_4BYTE_KIND:
10206 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010207 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 PyUnicode_GET_LENGTH(self), maxcount
10209 );
10210 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010211 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 }
10213
10214 if (PyUnicode_READY(substring) == -1)
10215 return NULL;
10216
10217 kind1 = PyUnicode_KIND(self);
10218 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 len1 = PyUnicode_GET_LENGTH(self);
10220 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010221 if (kind1 < kind2 || len1 < len2) {
10222 out = PyList_New(1);
10223 if (out == NULL)
10224 return NULL;
10225 Py_INCREF(self);
10226 PyList_SET_ITEM(out, 0, self);
10227 return out;
10228 }
10229 buf1 = PyUnicode_DATA(self);
10230 buf2 = PyUnicode_DATA(substring);
10231 if (kind2 != kind1) {
10232 buf2 = _PyUnicode_AsKind(substring, kind1);
10233 if (!buf2)
10234 return NULL;
10235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010237 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010239 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10240 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010242 else
10243 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 break;
10246 case PyUnicode_2BYTE_KIND:
10247 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 break;
10250 case PyUnicode_4BYTE_KIND:
10251 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 break;
10254 default:
10255 out = NULL;
10256 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010257 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 PyMem_Free(buf2);
10259 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260}
10261
Alexander Belopolsky40018472011-02-26 01:02:56 +000010262static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010263rsplit(PyObject *self,
10264 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010265 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010266{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010267 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 void *buf1, *buf2;
10269 Py_ssize_t len1, len2;
10270 PyObject* out;
10271
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010272 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010273 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 if (PyUnicode_READY(self) == -1)
10276 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010279 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010281 if (PyUnicode_IS_ASCII(self))
10282 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010284 PyUnicode_GET_LENGTH(self), maxcount
10285 );
10286 else
10287 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010289 PyUnicode_GET_LENGTH(self), maxcount
10290 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 case PyUnicode_2BYTE_KIND:
10292 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 PyUnicode_GET_LENGTH(self), maxcount
10295 );
10296 case PyUnicode_4BYTE_KIND:
10297 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 PyUnicode_GET_LENGTH(self), maxcount
10300 );
10301 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010302 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 }
10304
10305 if (PyUnicode_READY(substring) == -1)
10306 return NULL;
10307
10308 kind1 = PyUnicode_KIND(self);
10309 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 len1 = PyUnicode_GET_LENGTH(self);
10311 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010312 if (kind1 < kind2 || len1 < len2) {
10313 out = PyList_New(1);
10314 if (out == NULL)
10315 return NULL;
10316 Py_INCREF(self);
10317 PyList_SET_ITEM(out, 0, self);
10318 return out;
10319 }
10320 buf1 = PyUnicode_DATA(self);
10321 buf2 = PyUnicode_DATA(substring);
10322 if (kind2 != kind1) {
10323 buf2 = _PyUnicode_AsKind(substring, kind1);
10324 if (!buf2)
10325 return NULL;
10326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010328 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10331 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010333 else
10334 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 break;
10337 case PyUnicode_2BYTE_KIND:
10338 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 break;
10341 case PyUnicode_4BYTE_KIND:
10342 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 break;
10345 default:
10346 out = NULL;
10347 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010348 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 PyMem_Free(buf2);
10350 return out;
10351}
10352
10353static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010354anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10355 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010357 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010359 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10360 return asciilib_find(buf1, len1, buf2, len2, offset);
10361 else
10362 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 case PyUnicode_2BYTE_KIND:
10364 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10365 case PyUnicode_4BYTE_KIND:
10366 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10367 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010368 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369}
10370
10371static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10373 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010375 switch (kind) {
10376 case PyUnicode_1BYTE_KIND:
10377 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10378 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10379 else
10380 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10381 case PyUnicode_2BYTE_KIND:
10382 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10383 case PyUnicode_4BYTE_KIND:
10384 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10385 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010386 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010387}
10388
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010389static void
10390replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10391 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10392{
10393 int kind = PyUnicode_KIND(u);
10394 void *data = PyUnicode_DATA(u);
10395 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10396 if (kind == PyUnicode_1BYTE_KIND) {
10397 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10398 (Py_UCS1 *)data + len,
10399 u1, u2, maxcount);
10400 }
10401 else if (kind == PyUnicode_2BYTE_KIND) {
10402 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10403 (Py_UCS2 *)data + len,
10404 u1, u2, maxcount);
10405 }
10406 else {
10407 assert(kind == PyUnicode_4BYTE_KIND);
10408 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10409 (Py_UCS4 *)data + len,
10410 u1, u2, maxcount);
10411 }
10412}
10413
Alexander Belopolsky40018472011-02-26 01:02:56 +000010414static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415replace(PyObject *self, PyObject *str1,
10416 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 PyObject *u;
10419 char *sbuf = PyUnicode_DATA(self);
10420 char *buf1 = PyUnicode_DATA(str1);
10421 char *buf2 = PyUnicode_DATA(str2);
10422 int srelease = 0, release1 = 0, release2 = 0;
10423 int skind = PyUnicode_KIND(self);
10424 int kind1 = PyUnicode_KIND(str1);
10425 int kind2 = PyUnicode_KIND(str2);
10426 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10427 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10428 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010429 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010430 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431
10432 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010435 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436
Victor Stinner59de0ee2011-10-07 10:01:28 +020010437 if (str1 == str2)
10438 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439
Victor Stinner49a0a212011-10-12 23:46:10 +020010440 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010441 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10442 if (maxchar < maxchar_str1)
10443 /* substring too wide to be present */
10444 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010445 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10446 /* Replacing str1 with str2 may cause a maxchar reduction in the
10447 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010448 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010449 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010452 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010454 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010456 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010457 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010458 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010459
Victor Stinner69ed0f42013-04-09 21:48:24 +020010460 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010461 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010462 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010464 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010468
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010469 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10470 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010471 }
10472 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 int rkind = skind;
10474 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010475 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 if (kind1 < rkind) {
10478 /* widen substring */
10479 buf1 = _PyUnicode_AsKind(str1, rkind);
10480 if (!buf1) goto error;
10481 release1 = 1;
10482 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010483 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 if (i < 0)
10485 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (rkind > kind2) {
10487 /* widen replacement */
10488 buf2 = _PyUnicode_AsKind(str2, rkind);
10489 if (!buf2) goto error;
10490 release2 = 1;
10491 }
10492 else if (rkind < kind2) {
10493 /* widen self and buf1 */
10494 rkind = kind2;
10495 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010496 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 sbuf = _PyUnicode_AsKind(self, rkind);
10498 if (!sbuf) goto error;
10499 srelease = 1;
10500 buf1 = _PyUnicode_AsKind(str1, rkind);
10501 if (!buf1) goto error;
10502 release1 = 1;
10503 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010504 u = PyUnicode_New(slen, maxchar);
10505 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 assert(PyUnicode_KIND(u) == rkind);
10508 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010509
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010510 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010511 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010512 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010514 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010516
10517 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010518 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010519 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010520 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010521 if (i == -1)
10522 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010525 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010529 }
10530 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010532 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 int rkind = skind;
10534 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010537 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 buf1 = _PyUnicode_AsKind(str1, rkind);
10539 if (!buf1) goto error;
10540 release1 = 1;
10541 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010542 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010543 if (n == 0)
10544 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 buf2 = _PyUnicode_AsKind(str2, rkind);
10548 if (!buf2) goto error;
10549 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010552 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 rkind = kind2;
10554 sbuf = _PyUnicode_AsKind(self, rkind);
10555 if (!sbuf) goto error;
10556 srelease = 1;
10557 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010558 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 buf1 = _PyUnicode_AsKind(str1, rkind);
10560 if (!buf1) goto error;
10561 release1 = 1;
10562 }
10563 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10564 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010565 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 PyErr_SetString(PyExc_OverflowError,
10567 "replace string is too long");
10568 goto error;
10569 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010570 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010572 _Py_INCREF_UNICODE_EMPTY();
10573 if (!unicode_empty)
10574 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010575 u = unicode_empty;
10576 goto done;
10577 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010578 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 PyErr_SetString(PyExc_OverflowError,
10580 "replace string is too long");
10581 goto error;
10582 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 u = PyUnicode_New(new_size, maxchar);
10584 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 assert(PyUnicode_KIND(u) == rkind);
10587 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 ires = i = 0;
10589 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010590 while (n-- > 0) {
10591 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010593 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010594 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010595 if (j == -1)
10596 break;
10597 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010599 memcpy(res + rkind * ires,
10600 sbuf + rkind * i,
10601 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010603 }
10604 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010606 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010608 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010615 memcpy(res + rkind * ires,
10616 sbuf + rkind * i,
10617 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010618 }
10619 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620 /* interleave */
10621 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010622 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010624 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010626 if (--n <= 0)
10627 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010628 memcpy(res + rkind * ires,
10629 sbuf + rkind * i,
10630 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 ires++;
10632 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010634 memcpy(res + rkind * ires,
10635 sbuf + rkind * i,
10636 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010638 }
10639
10640 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010641 unicode_adjust_maxchar(&u);
10642 if (u == NULL)
10643 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010645
10646 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (srelease)
10648 PyMem_FREE(sbuf);
10649 if (release1)
10650 PyMem_FREE(buf1);
10651 if (release2)
10652 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010653 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010655
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (srelease)
10659 PyMem_FREE(sbuf);
10660 if (release1)
10661 PyMem_FREE(buf1);
10662 if (release2)
10663 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010664 return unicode_result_unchanged(self);
10665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 error:
10667 if (srelease && sbuf)
10668 PyMem_FREE(sbuf);
10669 if (release1 && buf1)
10670 PyMem_FREE(buf1);
10671 if (release2 && buf2)
10672 PyMem_FREE(buf2);
10673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674}
10675
10676/* --- Unicode Object Methods --------------------------------------------- */
10677
INADA Naoki3ae20562017-01-16 20:41:20 +090010678/*[clinic input]
10679str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680
INADA Naoki3ae20562017-01-16 20:41:20 +090010681Return a version of the string where each word is titlecased.
10682
10683More specifically, words start with uppercased characters and all remaining
10684cased characters have lower case.
10685[clinic start generated code]*/
10686
10687static PyObject *
10688unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010689/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010691 if (PyUnicode_READY(self) == -1)
10692 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010693 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694}
10695
INADA Naoki3ae20562017-01-16 20:41:20 +090010696/*[clinic input]
10697str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698
INADA Naoki3ae20562017-01-16 20:41:20 +090010699Return a capitalized version of the string.
10700
10701More specifically, make the first character have upper case and the rest lower
10702case.
10703[clinic start generated code]*/
10704
10705static PyObject *
10706unicode_capitalize_impl(PyObject *self)
10707/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010709 if (PyUnicode_READY(self) == -1)
10710 return NULL;
10711 if (PyUnicode_GET_LENGTH(self) == 0)
10712 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010713 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714}
10715
INADA Naoki3ae20562017-01-16 20:41:20 +090010716/*[clinic input]
10717str.casefold as unicode_casefold
10718
10719Return a version of the string suitable for caseless comparisons.
10720[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010721
10722static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010723unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010724/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010725{
10726 if (PyUnicode_READY(self) == -1)
10727 return NULL;
10728 if (PyUnicode_IS_ASCII(self))
10729 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010730 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010731}
10732
10733
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010734/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010735
10736static int
10737convert_uc(PyObject *obj, void *addr)
10738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010740
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010741 if (!PyUnicode_Check(obj)) {
10742 PyErr_Format(PyExc_TypeError,
10743 "The fill character must be a unicode character, "
Victor Stinner886483e2018-09-07 18:00:58 +020010744 "not %T", obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010745 return 0;
10746 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010747 if (PyUnicode_READY(obj) < 0)
10748 return 0;
10749 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010752 return 0;
10753 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010754 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010755 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010756}
10757
INADA Naoki3ae20562017-01-16 20:41:20 +090010758/*[clinic input]
10759str.center as unicode_center
10760
10761 width: Py_ssize_t
10762 fillchar: Py_UCS4 = ' '
10763 /
10764
10765Return a centered string of length width.
10766
10767Padding is done using the specified fill character (default is a space).
10768[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769
10770static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010771unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10772/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010774 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775
Benjamin Petersonbac79492012-01-14 13:34:47 -050010776 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 return NULL;
10778
Victor Stinnerc4b49542011-12-11 22:44:26 +010010779 if (PyUnicode_GET_LENGTH(self) >= width)
10780 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Victor Stinnerc4b49542011-12-11 22:44:26 +010010782 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783 left = marg / 2 + (marg & width & 1);
10784
Victor Stinner9310abb2011-10-05 00:59:23 +020010785 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786}
10787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788/* This function assumes that str1 and str2 are readied by the caller. */
10789
Marc-André Lemburge5034372000-08-08 08:04:29 +000010790static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010791unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010792{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010793#define COMPARE(TYPE1, TYPE2) \
10794 do { \
10795 TYPE1* p1 = (TYPE1 *)data1; \
10796 TYPE2* p2 = (TYPE2 *)data2; \
10797 TYPE1* end = p1 + len; \
10798 Py_UCS4 c1, c2; \
10799 for (; p1 != end; p1++, p2++) { \
10800 c1 = *p1; \
10801 c2 = *p2; \
10802 if (c1 != c2) \
10803 return (c1 < c2) ? -1 : 1; \
10804 } \
10805 } \
10806 while (0)
10807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 int kind1, kind2;
10809 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010810 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 kind1 = PyUnicode_KIND(str1);
10813 kind2 = PyUnicode_KIND(str2);
10814 data1 = PyUnicode_DATA(str1);
10815 data2 = PyUnicode_DATA(str2);
10816 len1 = PyUnicode_GET_LENGTH(str1);
10817 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010818 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010819
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010820 switch(kind1) {
10821 case PyUnicode_1BYTE_KIND:
10822 {
10823 switch(kind2) {
10824 case PyUnicode_1BYTE_KIND:
10825 {
10826 int cmp = memcmp(data1, data2, len);
10827 /* normalize result of memcmp() into the range [-1; 1] */
10828 if (cmp < 0)
10829 return -1;
10830 if (cmp > 0)
10831 return 1;
10832 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010833 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010834 case PyUnicode_2BYTE_KIND:
10835 COMPARE(Py_UCS1, Py_UCS2);
10836 break;
10837 case PyUnicode_4BYTE_KIND:
10838 COMPARE(Py_UCS1, Py_UCS4);
10839 break;
10840 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010841 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010842 }
10843 break;
10844 }
10845 case PyUnicode_2BYTE_KIND:
10846 {
10847 switch(kind2) {
10848 case PyUnicode_1BYTE_KIND:
10849 COMPARE(Py_UCS2, Py_UCS1);
10850 break;
10851 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010852 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010853 COMPARE(Py_UCS2, Py_UCS2);
10854 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010855 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010856 case PyUnicode_4BYTE_KIND:
10857 COMPARE(Py_UCS2, Py_UCS4);
10858 break;
10859 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010860 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010861 }
10862 break;
10863 }
10864 case PyUnicode_4BYTE_KIND:
10865 {
10866 switch(kind2) {
10867 case PyUnicode_1BYTE_KIND:
10868 COMPARE(Py_UCS4, Py_UCS1);
10869 break;
10870 case PyUnicode_2BYTE_KIND:
10871 COMPARE(Py_UCS4, Py_UCS2);
10872 break;
10873 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010874 {
10875#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10876 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10877 /* normalize result of wmemcmp() into the range [-1; 1] */
10878 if (cmp < 0)
10879 return -1;
10880 if (cmp > 0)
10881 return 1;
10882#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010884#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010886 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010888 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010889 }
10890 break;
10891 }
10892 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010893 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010894 }
10895
Victor Stinner770e19e2012-10-04 22:59:45 +020010896 if (len1 == len2)
10897 return 0;
10898 if (len1 < len2)
10899 return -1;
10900 else
10901 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902
10903#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010904}
10905
Benjamin Peterson621b4302016-09-09 13:54:34 -070010906static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010907unicode_compare_eq(PyObject *str1, PyObject *str2)
10908{
10909 int kind;
10910 void *data1, *data2;
10911 Py_ssize_t len;
10912 int cmp;
10913
Victor Stinnere5567ad2012-10-23 02:48:49 +020010914 len = PyUnicode_GET_LENGTH(str1);
10915 if (PyUnicode_GET_LENGTH(str2) != len)
10916 return 0;
10917 kind = PyUnicode_KIND(str1);
10918 if (PyUnicode_KIND(str2) != kind)
10919 return 0;
10920 data1 = PyUnicode_DATA(str1);
10921 data2 = PyUnicode_DATA(str2);
10922
10923 cmp = memcmp(data1, data2, len * kind);
10924 return (cmp == 0);
10925}
10926
10927
Alexander Belopolsky40018472011-02-26 01:02:56 +000010928int
10929PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10932 if (PyUnicode_READY(left) == -1 ||
10933 PyUnicode_READY(right) == -1)
10934 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010935
10936 /* a string is equal to itself */
10937 if (left == right)
10938 return 0;
10939
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010940 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010942 PyErr_Format(PyExc_TypeError,
10943 "Can't compare %.100s and %.100s",
10944 left->ob_type->tp_name,
10945 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946 return -1;
10947}
10948
Martin v. Löwis5b222132007-06-10 09:51:05 +000010949int
10950PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 Py_ssize_t i;
10953 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010955 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956
Victor Stinner910337b2011-10-03 03:20:16 +020010957 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010958 if (!PyUnicode_IS_READY(uni)) {
10959 const wchar_t *ws = _PyUnicode_WSTR(uni);
10960 /* Compare Unicode string and source character set string */
10961 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10962 if (chr != ustr[i])
10963 return (chr < ustr[i]) ? -1 : 1;
10964 }
10965 /* This check keeps Python strings that end in '\0' from comparing equal
10966 to C strings identical up to that point. */
10967 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10968 return 1; /* uni is longer */
10969 if (ustr[i])
10970 return -1; /* str is longer */
10971 return 0;
10972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010974 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010975 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010976 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010977 size_t len, len2 = strlen(str);
10978 int cmp;
10979
10980 len = Py_MIN(len1, len2);
10981 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010982 if (cmp != 0) {
10983 if (cmp < 0)
10984 return -1;
10985 else
10986 return 1;
10987 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010988 if (len1 > len2)
10989 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010990 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010991 return -1; /* str is longer */
10992 return 0;
10993 }
10994 else {
10995 void *data = PyUnicode_DATA(uni);
10996 /* Compare Unicode string and source character set string */
10997 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010998 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010999 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11000 /* This check keeps Python strings that end in '\0' from comparing equal
11001 to C strings identical up to that point. */
11002 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11003 return 1; /* uni is longer */
11004 if (str[i])
11005 return -1; /* str is longer */
11006 return 0;
11007 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011008}
11009
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011010static int
11011non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11012{
11013 size_t i, len;
11014 const wchar_t *p;
11015 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11016 if (strlen(str) != len)
11017 return 0;
11018 p = _PyUnicode_WSTR(unicode);
11019 assert(p);
11020 for (i = 0; i < len; i++) {
11021 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011022 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011023 return 0;
11024 }
11025 return 1;
11026}
11027
11028int
11029_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11030{
11031 size_t len;
11032 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011033 assert(str);
11034#ifndef NDEBUG
11035 for (const char *p = str; *p; p++) {
11036 assert((unsigned char)*p < 128);
11037 }
11038#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011039 if (PyUnicode_READY(unicode) == -1) {
11040 /* Memory error or bad data */
11041 PyErr_Clear();
11042 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11043 }
11044 if (!PyUnicode_IS_ASCII(unicode))
11045 return 0;
11046 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11047 return strlen(str) == len &&
11048 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11049}
11050
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011051int
11052_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11053{
11054 PyObject *right_uni;
11055 Py_hash_t hash;
11056
11057 assert(_PyUnicode_CHECK(left));
11058 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011059#ifndef NDEBUG
11060 for (const char *p = right->string; *p; p++) {
11061 assert((unsigned char)*p < 128);
11062 }
11063#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011064
11065 if (PyUnicode_READY(left) == -1) {
11066 /* memory error or bad data */
11067 PyErr_Clear();
11068 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11069 }
11070
11071 if (!PyUnicode_IS_ASCII(left))
11072 return 0;
11073
11074 right_uni = _PyUnicode_FromId(right); /* borrowed */
11075 if (right_uni == NULL) {
11076 /* memory error or bad data */
11077 PyErr_Clear();
11078 return _PyUnicode_EqualToASCIIString(left, right->string);
11079 }
11080
11081 if (left == right_uni)
11082 return 1;
11083
11084 if (PyUnicode_CHECK_INTERNED(left))
11085 return 0;
11086
INADA Naoki7cc95f52018-01-28 02:07:09 +090011087 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011088 hash = _PyUnicode_HASH(left);
11089 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11090 return 0;
11091
11092 return unicode_compare_eq(left, right_uni);
11093}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011094
Alexander Belopolsky40018472011-02-26 01:02:56 +000011095PyObject *
11096PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011097{
11098 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011099
Victor Stinnere5567ad2012-10-23 02:48:49 +020011100 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11101 Py_RETURN_NOTIMPLEMENTED;
11102
11103 if (PyUnicode_READY(left) == -1 ||
11104 PyUnicode_READY(right) == -1)
11105 return NULL;
11106
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011107 if (left == right) {
11108 switch (op) {
11109 case Py_EQ:
11110 case Py_LE:
11111 case Py_GE:
11112 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011113 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011114 case Py_NE:
11115 case Py_LT:
11116 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011117 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011118 default:
11119 PyErr_BadArgument();
11120 return NULL;
11121 }
11122 }
11123 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011124 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011125 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011126 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011127 }
11128 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011129 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011130 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011131 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011132}
11133
Alexander Belopolsky40018472011-02-26 01:02:56 +000011134int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011135_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11136{
11137 return unicode_eq(aa, bb);
11138}
11139
11140int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011141PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011142{
Victor Stinner77282cb2013-04-14 19:22:47 +020011143 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 void *buf1, *buf2;
11145 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011146 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011147
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011148 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +020011150 "'in <string>' requires string as left operand, not %T",
11151 substr);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011152 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011153 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011154 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011156 if (ensure_unicode(str) < 0)
11157 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011160 kind2 = PyUnicode_KIND(substr);
11161 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011162 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011164 len2 = PyUnicode_GET_LENGTH(substr);
11165 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011166 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011167 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011168 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011169 if (len2 == 1) {
11170 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11171 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011172 return result;
11173 }
11174 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011175 buf2 = _PyUnicode_AsKind(substr, kind1);
11176 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011177 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179
Victor Stinner77282cb2013-04-14 19:22:47 +020011180 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 case PyUnicode_1BYTE_KIND:
11182 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11183 break;
11184 case PyUnicode_2BYTE_KIND:
11185 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11186 break;
11187 case PyUnicode_4BYTE_KIND:
11188 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11189 break;
11190 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011191 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011193
Victor Stinner77282cb2013-04-14 19:22:47 +020011194 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 PyMem_Free(buf2);
11196
Guido van Rossum403d68b2000-03-13 15:55:09 +000011197 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011198}
11199
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200/* Concat to string or Unicode object giving a new Unicode object. */
11201
Alexander Belopolsky40018472011-02-26 01:02:56 +000011202PyObject *
11203PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011205 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011206 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011207 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011209 if (ensure_unicode(left) < 0)
11210 return NULL;
11211
11212 if (!PyUnicode_Check(right)) {
11213 PyErr_Format(PyExc_TypeError,
11214 "can only concatenate str (not \"%.200s\") to str",
11215 right->ob_type->tp_name);
11216 return NULL;
11217 }
11218 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
11221 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 if (left == unicode_empty)
11223 return PyUnicode_FromObject(right);
11224 if (right == unicode_empty)
11225 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227 left_len = PyUnicode_GET_LENGTH(left);
11228 right_len = PyUnicode_GET_LENGTH(right);
11229 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011230 PyErr_SetString(PyExc_OverflowError,
11231 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011232 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011233 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011234 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011235
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11237 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011238 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011241 result = PyUnicode_New(new_len, maxchar);
11242 if (result == NULL)
11243 return NULL;
11244 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11245 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11246 assert(_PyUnicode_CheckConsistency(result, 1));
11247 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248}
11249
Walter Dörwald1ab83302007-05-18 17:15:44 +000011250void
Victor Stinner23e56682011-10-03 03:54:37 +020011251PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011252{
Victor Stinner23e56682011-10-03 03:54:37 +020011253 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011254 Py_UCS4 maxchar, maxchar2;
11255 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011256
11257 if (p_left == NULL) {
11258 if (!PyErr_Occurred())
11259 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011260 return;
11261 }
Victor Stinner23e56682011-10-03 03:54:37 +020011262 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011263 if (right == NULL || left == NULL
11264 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011265 if (!PyErr_Occurred())
11266 PyErr_BadInternalCall();
11267 goto error;
11268 }
11269
Benjamin Petersonbac79492012-01-14 13:34:47 -050011270 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011271 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011272 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011273 goto error;
11274
Victor Stinner488fa492011-12-12 00:01:39 +010011275 /* Shortcuts */
11276 if (left == unicode_empty) {
11277 Py_DECREF(left);
11278 Py_INCREF(right);
11279 *p_left = right;
11280 return;
11281 }
11282 if (right == unicode_empty)
11283 return;
11284
11285 left_len = PyUnicode_GET_LENGTH(left);
11286 right_len = PyUnicode_GET_LENGTH(right);
11287 if (left_len > PY_SSIZE_T_MAX - right_len) {
11288 PyErr_SetString(PyExc_OverflowError,
11289 "strings are too large to concat");
11290 goto error;
11291 }
11292 new_len = left_len + right_len;
11293
11294 if (unicode_modifiable(left)
11295 && PyUnicode_CheckExact(right)
11296 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011297 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11298 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011299 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011300 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011301 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11302 {
11303 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011304 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011305 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011306
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011307 /* copy 'right' into the newly allocated area of 'left' */
11308 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011309 }
Victor Stinner488fa492011-12-12 00:01:39 +010011310 else {
11311 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11312 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011313 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011314
Victor Stinner488fa492011-12-12 00:01:39 +010011315 /* Concat the two Unicode strings */
11316 res = PyUnicode_New(new_len, maxchar);
11317 if (res == NULL)
11318 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011319 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11320 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011321 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011322 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011323 }
11324 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011325 return;
11326
11327error:
Victor Stinner488fa492011-12-12 00:01:39 +010011328 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011329}
11330
11331void
11332PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11333{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011334 PyUnicode_Append(pleft, right);
11335 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011336}
11337
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011338/*
11339Wraps stringlib_parse_args_finds() and additionally ensures that the
11340first argument is a unicode object.
11341*/
11342
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011343static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011344parse_args_finds_unicode(const char * function_name, PyObject *args,
11345 PyObject **substring,
11346 Py_ssize_t *start, Py_ssize_t *end)
11347{
11348 if(stringlib_parse_args_finds(function_name, args, substring,
11349 start, end)) {
11350 if (ensure_unicode(*substring) < 0)
11351 return 0;
11352 return 1;
11353 }
11354 return 0;
11355}
11356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011357PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011360Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011361string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011362interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363
11364static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011365unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011367 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011368 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011369 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011371 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 void *buf1, *buf2;
11373 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011375 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 kind1 = PyUnicode_KIND(self);
11379 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011380 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011381 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 len1 = PyUnicode_GET_LENGTH(self);
11384 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011386 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011387 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011388
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011389 buf1 = PyUnicode_DATA(self);
11390 buf2 = PyUnicode_DATA(substring);
11391 if (kind2 != kind1) {
11392 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011393 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011394 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011395 }
11396 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 case PyUnicode_1BYTE_KIND:
11398 iresult = ucs1lib_count(
11399 ((Py_UCS1*)buf1) + start, end - start,
11400 buf2, len2, PY_SSIZE_T_MAX
11401 );
11402 break;
11403 case PyUnicode_2BYTE_KIND:
11404 iresult = ucs2lib_count(
11405 ((Py_UCS2*)buf1) + start, end - start,
11406 buf2, len2, PY_SSIZE_T_MAX
11407 );
11408 break;
11409 case PyUnicode_4BYTE_KIND:
11410 iresult = ucs4lib_count(
11411 ((Py_UCS4*)buf1) + start, end - start,
11412 buf2, len2, PY_SSIZE_T_MAX
11413 );
11414 break;
11415 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011416 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 }
11418
11419 result = PyLong_FromSsize_t(iresult);
11420
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011421 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 return result;
11425}
11426
INADA Naoki3ae20562017-01-16 20:41:20 +090011427/*[clinic input]
11428str.encode as unicode_encode
11429
11430 encoding: str(c_default="NULL") = 'utf-8'
11431 The encoding in which to encode the string.
11432 errors: str(c_default="NULL") = 'strict'
11433 The error handling scheme to use for encoding errors.
11434 The default is 'strict' meaning that encoding errors raise a
11435 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11436 'xmlcharrefreplace' as well as any other name registered with
11437 codecs.register_error that can handle UnicodeEncodeErrors.
11438
11439Encode the string using the codec registered for encoding.
11440[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011443unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011444/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011446 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011447}
11448
INADA Naoki3ae20562017-01-16 20:41:20 +090011449/*[clinic input]
11450str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
INADA Naoki3ae20562017-01-16 20:41:20 +090011452 tabsize: int = 8
11453
11454Return a copy where all tab characters are expanded using spaces.
11455
11456If tabsize is not given, a tab size of 8 characters is assumed.
11457[clinic start generated code]*/
11458
11459static PyObject *
11460unicode_expandtabs_impl(PyObject *self, int tabsize)
11461/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 Py_ssize_t i, j, line_pos, src_len, incr;
11464 Py_UCS4 ch;
11465 PyObject *u;
11466 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011468 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
Antoine Pitrou22425222011-10-04 19:10:51 +020011470 if (PyUnicode_READY(self) == -1)
11471 return NULL;
11472
Thomas Wouters7e474022000-07-16 12:04:32 +000011473 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 src_len = PyUnicode_GET_LENGTH(self);
11475 i = j = line_pos = 0;
11476 kind = PyUnicode_KIND(self);
11477 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011478 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011479 for (; i < src_len; i++) {
11480 ch = PyUnicode_READ(kind, src_data, i);
11481 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011482 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011486 goto overflow;
11487 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011489 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011493 goto overflow;
11494 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011496 if (ch == '\n' || ch == '\r')
11497 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011499 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011500 if (!found)
11501 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011502
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011504 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 if (!u)
11506 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011507 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Antoine Pitroue71d5742011-10-04 15:55:09 +020011509 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Antoine Pitroue71d5742011-10-04 15:55:09 +020011511 for (; i < src_len; i++) {
11512 ch = PyUnicode_READ(kind, src_data, i);
11513 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011515 incr = tabsize - (line_pos % tabsize);
11516 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011517 FILL(kind, dest_data, ' ', j, incr);
11518 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011520 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011522 line_pos++;
11523 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011524 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011525 if (ch == '\n' || ch == '\r')
11526 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011528 }
11529 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011530 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011531
Antoine Pitroue71d5742011-10-04 15:55:09 +020011532 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011533 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535}
11536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011537PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539\n\
11540Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011541such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542arguments start and end are interpreted as in slice notation.\n\
11543\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011544Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
11546static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011549 /* initialize variables to prevent gcc warning */
11550 PyObject *substring = NULL;
11551 Py_ssize_t start = 0;
11552 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011553 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011555 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011558 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011561 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 if (result == -2)
11564 return NULL;
11565
Christian Heimes217cfd12007-12-02 14:31:20 +000011566 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567}
11568
11569static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011570unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011572 void *data;
11573 enum PyUnicode_Kind kind;
11574 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011575
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011576 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011577 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011579 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011580 if (PyUnicode_READY(self) == -1) {
11581 return NULL;
11582 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011583 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11584 PyErr_SetString(PyExc_IndexError, "string index out of range");
11585 return NULL;
11586 }
11587 kind = PyUnicode_KIND(self);
11588 data = PyUnicode_DATA(self);
11589 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011590 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Guido van Rossumc2504932007-09-18 19:42:40 +000011593/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011594 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011595static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011596unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597{
Guido van Rossumc2504932007-09-18 19:42:40 +000011598 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011599 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011600
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011601#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011602 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011603#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 if (_PyUnicode_HASH(self) != -1)
11605 return _PyUnicode_HASH(self);
11606 if (PyUnicode_READY(self) == -1)
11607 return -1;
11608 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011609 /*
11610 We make the hash of the empty string be 0, rather than using
11611 (prefix ^ suffix), since this slightly obfuscates the hash secret
11612 */
11613 if (len == 0) {
11614 _PyUnicode_HASH(self) = 0;
11615 return 0;
11616 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011617 x = _Py_HashBytes(PyUnicode_DATA(self),
11618 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011620 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
oldkaa0735f2018-02-02 16:52:55 +080011626Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011627such that sub is contained within S[start:end]. Optional\n\
11628arguments start and end are interpreted as in slice notation.\n\
11629\n\
11630Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
11632static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011635 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011636 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011637 PyObject *substring = NULL;
11638 Py_ssize_t start = 0;
11639 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011641 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011644 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011647 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (result == -2)
11650 return NULL;
11651
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 if (result < 0) {
11653 PyErr_SetString(PyExc_ValueError, "substring not found");
11654 return NULL;
11655 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011656
Christian Heimes217cfd12007-12-02 14:31:20 +000011657 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658}
11659
INADA Naoki3ae20562017-01-16 20:41:20 +090011660/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011661str.isascii as unicode_isascii
11662
11663Return True if all characters in the string are ASCII, False otherwise.
11664
11665ASCII characters have code points in the range U+0000-U+007F.
11666Empty string is ASCII too.
11667[clinic start generated code]*/
11668
11669static PyObject *
11670unicode_isascii_impl(PyObject *self)
11671/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11672{
11673 if (PyUnicode_READY(self) == -1) {
11674 return NULL;
11675 }
11676 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11677}
11678
11679/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011680str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681
INADA Naoki3ae20562017-01-16 20:41:20 +090011682Return True if the string is a lowercase string, False otherwise.
11683
11684A string is lowercase if all cased characters in the string are lowercase and
11685there is at least one cased character in the string.
11686[clinic start generated code]*/
11687
11688static PyObject *
11689unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011690/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 Py_ssize_t i, length;
11693 int kind;
11694 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695 int cased;
11696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 if (PyUnicode_READY(self) == -1)
11698 return NULL;
11699 length = PyUnicode_GET_LENGTH(self);
11700 kind = PyUnicode_KIND(self);
11701 data = PyUnicode_DATA(self);
11702
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (length == 1)
11705 return PyBool_FromLong(
11706 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011708 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011710 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011711
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 for (i = 0; i < length; i++) {
11714 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011715
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011717 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 else if (!cased && Py_UNICODE_ISLOWER(ch))
11719 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011721 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722}
11723
INADA Naoki3ae20562017-01-16 20:41:20 +090011724/*[clinic input]
11725str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
INADA Naoki3ae20562017-01-16 20:41:20 +090011727Return True if the string is an uppercase string, False otherwise.
11728
11729A string is uppercase if all cased characters in the string are uppercase and
11730there is at least one cased character in the string.
11731[clinic start generated code]*/
11732
11733static PyObject *
11734unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011735/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 Py_ssize_t i, length;
11738 int kind;
11739 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 int cased;
11741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 if (PyUnicode_READY(self) == -1)
11743 return NULL;
11744 length = PyUnicode_GET_LENGTH(self);
11745 kind = PyUnicode_KIND(self);
11746 data = PyUnicode_DATA(self);
11747
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (length == 1)
11750 return PyBool_FromLong(
11751 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011753 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011755 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011756
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 for (i = 0; i < length; i++) {
11759 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011760
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011762 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 else if (!cased && Py_UNICODE_ISUPPER(ch))
11764 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011766 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767}
11768
INADA Naoki3ae20562017-01-16 20:41:20 +090011769/*[clinic input]
11770str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
INADA Naoki3ae20562017-01-16 20:41:20 +090011772Return True if the string is a title-cased string, False otherwise.
11773
11774In a title-cased string, upper- and title-case characters may only
11775follow uncased characters and lowercase characters only cased ones.
11776[clinic start generated code]*/
11777
11778static PyObject *
11779unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011780/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 Py_ssize_t i, length;
11783 int kind;
11784 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 int cased, previous_is_cased;
11786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 if (PyUnicode_READY(self) == -1)
11788 return NULL;
11789 length = PyUnicode_GET_LENGTH(self);
11790 kind = PyUnicode_KIND(self);
11791 data = PyUnicode_DATA(self);
11792
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (length == 1) {
11795 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11796 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11797 (Py_UNICODE_ISUPPER(ch) != 0));
11798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011800 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011802 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011803
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 cased = 0;
11805 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 for (i = 0; i < length; i++) {
11807 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011808
Benjamin Peterson29060642009-01-31 22:14:21 +000011809 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11810 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011811 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 previous_is_cased = 1;
11813 cased = 1;
11814 }
11815 else if (Py_UNICODE_ISLOWER(ch)) {
11816 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011817 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 previous_is_cased = 1;
11819 cased = 1;
11820 }
11821 else
11822 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011824 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825}
11826
INADA Naoki3ae20562017-01-16 20:41:20 +090011827/*[clinic input]
11828str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
INADA Naoki3ae20562017-01-16 20:41:20 +090011830Return True if the string is a whitespace string, False otherwise.
11831
11832A string is whitespace if all characters in the string are whitespace and there
11833is at least one character in the string.
11834[clinic start generated code]*/
11835
11836static PyObject *
11837unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011838/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 Py_ssize_t i, length;
11841 int kind;
11842 void *data;
11843
11844 if (PyUnicode_READY(self) == -1)
11845 return NULL;
11846 length = PyUnicode_GET_LENGTH(self);
11847 kind = PyUnicode_KIND(self);
11848 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (length == 1)
11852 return PyBool_FromLong(
11853 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011855 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011857 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 for (i = 0; i < length; i++) {
11860 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011861 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011862 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011864 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865}
11866
INADA Naoki3ae20562017-01-16 20:41:20 +090011867/*[clinic input]
11868str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011869
INADA Naoki3ae20562017-01-16 20:41:20 +090011870Return True if the string is an alphabetic string, False otherwise.
11871
11872A string is alphabetic if all characters in the string are alphabetic and there
11873is at least one character in the string.
11874[clinic start generated code]*/
11875
11876static PyObject *
11877unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011878/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 Py_ssize_t i, length;
11881 int kind;
11882 void *data;
11883
11884 if (PyUnicode_READY(self) == -1)
11885 return NULL;
11886 length = PyUnicode_GET_LENGTH(self);
11887 kind = PyUnicode_KIND(self);
11888 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011889
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011890 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 if (length == 1)
11892 return PyBool_FromLong(
11893 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011894
11895 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011897 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 for (i = 0; i < length; i++) {
11900 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011901 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011902 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011903 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011904}
11905
INADA Naoki3ae20562017-01-16 20:41:20 +090011906/*[clinic input]
11907str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011908
INADA Naoki3ae20562017-01-16 20:41:20 +090011909Return True if the string is an alpha-numeric string, False otherwise.
11910
11911A string is alpha-numeric if all characters in the string are alpha-numeric and
11912there is at least one character in the string.
11913[clinic start generated code]*/
11914
11915static PyObject *
11916unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011917/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 int kind;
11920 void *data;
11921 Py_ssize_t len, i;
11922
11923 if (PyUnicode_READY(self) == -1)
11924 return NULL;
11925
11926 kind = PyUnicode_KIND(self);
11927 data = PyUnicode_DATA(self);
11928 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011929
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011930 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (len == 1) {
11932 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11933 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11934 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011935
11936 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011938 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 for (i = 0; i < len; i++) {
11941 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011942 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011943 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011944 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011945 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011946}
11947
INADA Naoki3ae20562017-01-16 20:41:20 +090011948/*[clinic input]
11949str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950
INADA Naoki3ae20562017-01-16 20:41:20 +090011951Return True if the string is a decimal string, False otherwise.
11952
11953A string is a decimal string if all characters in the string are decimal and
11954there is at least one character in the string.
11955[clinic start generated code]*/
11956
11957static PyObject *
11958unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011959/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 Py_ssize_t i, length;
11962 int kind;
11963 void *data;
11964
11965 if (PyUnicode_READY(self) == -1)
11966 return NULL;
11967 length = PyUnicode_GET_LENGTH(self);
11968 kind = PyUnicode_KIND(self);
11969 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (length == 1)
11973 return PyBool_FromLong(
11974 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011976 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011978 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 for (i = 0; i < length; i++) {
11981 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011982 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011984 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985}
11986
INADA Naoki3ae20562017-01-16 20:41:20 +090011987/*[clinic input]
11988str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
INADA Naoki3ae20562017-01-16 20:41:20 +090011990Return True if the string is a digit string, False otherwise.
11991
11992A string is a digit string if all characters in the string are digits and there
11993is at least one character in the string.
11994[clinic start generated code]*/
11995
11996static PyObject *
11997unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011998/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 Py_ssize_t i, length;
12001 int kind;
12002 void *data;
12003
12004 if (PyUnicode_READY(self) == -1)
12005 return NULL;
12006 length = PyUnicode_GET_LENGTH(self);
12007 kind = PyUnicode_KIND(self);
12008 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 if (length == 1) {
12012 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12013 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012016 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012018 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 for (i = 0; i < length; i++) {
12021 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012022 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012024 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025}
12026
INADA Naoki3ae20562017-01-16 20:41:20 +090012027/*[clinic input]
12028str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
INADA Naoki3ae20562017-01-16 20:41:20 +090012030Return True if the string is a numeric string, False otherwise.
12031
12032A string is numeric if all characters in the string are numeric and there is at
12033least one character in the string.
12034[clinic start generated code]*/
12035
12036static PyObject *
12037unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012038/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 Py_ssize_t i, length;
12041 int kind;
12042 void *data;
12043
12044 if (PyUnicode_READY(self) == -1)
12045 return NULL;
12046 length = PyUnicode_GET_LENGTH(self);
12047 kind = PyUnicode_KIND(self);
12048 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 if (length == 1)
12052 return PyBool_FromLong(
12053 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012055 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012057 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 for (i = 0; i < length; i++) {
12060 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012061 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012063 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064}
12065
Martin v. Löwis47383402007-08-15 07:32:56 +000012066int
12067PyUnicode_IsIdentifier(PyObject *self)
12068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 int kind;
12070 void *data;
12071 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012072 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 if (PyUnicode_READY(self) == -1) {
12075 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 }
12078
12079 /* Special case for empty strings */
12080 if (PyUnicode_GET_LENGTH(self) == 0)
12081 return 0;
12082 kind = PyUnicode_KIND(self);
12083 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012084
12085 /* PEP 3131 says that the first character must be in
12086 XID_Start and subsequent characters in XID_Continue,
12087 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012088 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012089 letters, digits, underscore). However, given the current
12090 definition of XID_Start and XID_Continue, it is sufficient
12091 to check just for these, except that _ must be allowed
12092 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012094 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012095 return 0;
12096
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012097 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012100 return 1;
12101}
12102
INADA Naoki3ae20562017-01-16 20:41:20 +090012103/*[clinic input]
12104str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012105
INADA Naoki3ae20562017-01-16 20:41:20 +090012106Return True if the string is a valid Python identifier, False otherwise.
12107
12108Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12109"class".
12110[clinic start generated code]*/
12111
12112static PyObject *
12113unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012114/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012115{
12116 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12117}
12118
INADA Naoki3ae20562017-01-16 20:41:20 +090012119/*[clinic input]
12120str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012121
INADA Naoki3ae20562017-01-16 20:41:20 +090012122Return True if the string is printable, False otherwise.
12123
12124A string is printable if all of its characters are considered printable in
12125repr() or if it is empty.
12126[clinic start generated code]*/
12127
12128static PyObject *
12129unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012130/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 Py_ssize_t i, length;
12133 int kind;
12134 void *data;
12135
12136 if (PyUnicode_READY(self) == -1)
12137 return NULL;
12138 length = PyUnicode_GET_LENGTH(self);
12139 kind = PyUnicode_KIND(self);
12140 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012141
12142 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (length == 1)
12144 return PyBool_FromLong(
12145 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 for (i = 0; i < length; i++) {
12148 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012149 Py_RETURN_FALSE;
12150 }
12151 }
12152 Py_RETURN_TRUE;
12153}
12154
INADA Naoki3ae20562017-01-16 20:41:20 +090012155/*[clinic input]
12156str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
INADA Naoki3ae20562017-01-16 20:41:20 +090012158 iterable: object
12159 /
12160
12161Concatenate any number of strings.
12162
Martin Panter91a88662017-01-24 00:30:06 +000012163The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012164The result is returned as a new string.
12165
12166Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012171/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
INADA Naoki3ae20562017-01-16 20:41:20 +090012173 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174}
12175
Martin v. Löwis18e16552006-02-15 17:27:45 +000012176static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012177unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (PyUnicode_READY(self) == -1)
12180 return -1;
12181 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182}
12183
INADA Naoki3ae20562017-01-16 20:41:20 +090012184/*[clinic input]
12185str.ljust as unicode_ljust
12186
12187 width: Py_ssize_t
12188 fillchar: Py_UCS4 = ' '
12189 /
12190
12191Return a left-justified string of length width.
12192
12193Padding is done using the specified fill character (default is a space).
12194[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
12196static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012197unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12198/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012200 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202
Victor Stinnerc4b49542011-12-11 22:44:26 +010012203 if (PyUnicode_GET_LENGTH(self) >= width)
12204 return unicode_result_unchanged(self);
12205
12206 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207}
12208
INADA Naoki3ae20562017-01-16 20:41:20 +090012209/*[clinic input]
12210str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211
INADA Naoki3ae20562017-01-16 20:41:20 +090012212Return a copy of the string converted to lowercase.
12213[clinic start generated code]*/
12214
12215static PyObject *
12216unicode_lower_impl(PyObject *self)
12217/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012219 if (PyUnicode_READY(self) == -1)
12220 return NULL;
12221 if (PyUnicode_IS_ASCII(self))
12222 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012223 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224}
12225
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the characters and the
   pointers are const: this is a fixed lookup table that is never modified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip selector (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) to the name of
   the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012234
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012235/* externally visible for str.strip(unicode) */
12236PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012237_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 void *data;
12240 int kind;
12241 Py_ssize_t i, j, len;
12242 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012243 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12246 return NULL;
12247
12248 kind = PyUnicode_KIND(self);
12249 data = PyUnicode_DATA(self);
12250 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012251 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12253 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012254 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255
Benjamin Peterson14339b62009-01-31 16:36:08 +000012256 i = 0;
12257 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012258 while (i < len) {
12259 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12260 if (!BLOOM(sepmask, ch))
12261 break;
12262 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12263 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 i++;
12265 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012266 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012267
Benjamin Peterson14339b62009-01-31 16:36:08 +000012268 j = len;
12269 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012270 j--;
12271 while (j >= i) {
12272 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12273 if (!BLOOM(sepmask, ch))
12274 break;
12275 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12276 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012278 }
12279
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012281 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012282
Victor Stinner7931d9a2011-11-04 00:22:48 +010012283 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284}
12285
12286PyObject*
12287PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12288{
12289 unsigned char *data;
12290 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012291 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292
Victor Stinnerde636f32011-10-01 03:55:54 +020012293 if (PyUnicode_READY(self) == -1)
12294 return NULL;
12295
Victor Stinner684d5fd2012-05-03 02:32:34 +020012296 length = PyUnicode_GET_LENGTH(self);
12297 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012298
Victor Stinner684d5fd2012-05-03 02:32:34 +020012299 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012300 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301
Victor Stinnerde636f32011-10-01 03:55:54 +020012302 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012303 PyErr_SetString(PyExc_IndexError, "string index out of range");
12304 return NULL;
12305 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012306 if (start >= length || end < start)
12307 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012308
Victor Stinner684d5fd2012-05-03 02:32:34 +020012309 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012310 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012311 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012312 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012313 }
12314 else {
12315 kind = PyUnicode_KIND(self);
12316 data = PyUnicode_1BYTE_DATA(self);
12317 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012318 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012319 length);
12320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
12323static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012324do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 Py_ssize_t len, i, j;
12327
12328 if (PyUnicode_READY(self) == -1)
12329 return NULL;
12330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012332
Victor Stinnercc7af722013-04-09 22:39:24 +020012333 if (PyUnicode_IS_ASCII(self)) {
12334 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12335
12336 i = 0;
12337 if (striptype != RIGHTSTRIP) {
12338 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012339 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012340 if (!_Py_ascii_whitespace[ch])
12341 break;
12342 i++;
12343 }
12344 }
12345
12346 j = len;
12347 if (striptype != LEFTSTRIP) {
12348 j--;
12349 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012350 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012351 if (!_Py_ascii_whitespace[ch])
12352 break;
12353 j--;
12354 }
12355 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012356 }
12357 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012358 else {
12359 int kind = PyUnicode_KIND(self);
12360 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012361
Victor Stinnercc7af722013-04-09 22:39:24 +020012362 i = 0;
12363 if (striptype != RIGHTSTRIP) {
12364 while (i < len) {
12365 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12366 if (!Py_UNICODE_ISSPACE(ch))
12367 break;
12368 i++;
12369 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012370 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012371
12372 j = len;
12373 if (striptype != LEFTSTRIP) {
12374 j--;
12375 while (j >= i) {
12376 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12377 if (!Py_UNICODE_ISSPACE(ch))
12378 break;
12379 j--;
12380 }
12381 j++;
12382 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012383 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012384
Victor Stinner7931d9a2011-11-04 00:22:48 +010012385 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386}
12387
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012388
12389static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012390do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012391{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 if (sep != NULL && sep != Py_None) {
12393 if (PyUnicode_Check(sep))
12394 return _PyUnicode_XStrip(self, striptype, sep);
12395 else {
12396 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 "%s arg must be None or str",
12398 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 return NULL;
12400 }
12401 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012402
Benjamin Peterson14339b62009-01-31 16:36:08 +000012403 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012404}
12405
12406
INADA Naoki3ae20562017-01-16 20:41:20 +090012407/*[clinic input]
12408str.strip as unicode_strip
12409
12410 chars: object = None
12411 /
12412
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012414
12415If chars is given and not None, remove characters in chars instead.
12416[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012417
12418static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012419unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012420/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012421{
INADA Naoki3ae20562017-01-16 20:41:20 +090012422 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012423}
12424
12425
INADA Naoki3ae20562017-01-16 20:41:20 +090012426/*[clinic input]
12427str.lstrip as unicode_lstrip
12428
12429 chars: object = NULL
12430 /
12431
12432Return a copy of the string with leading whitespace removed.
12433
12434If chars is given and not None, remove characters in chars instead.
12435[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012436
12437static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012438unicode_lstrip_impl(PyObject *self, PyObject *chars)
12439/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440{
INADA Naoki3ae20562017-01-16 20:41:20 +090012441 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012442}
12443
12444
INADA Naoki3ae20562017-01-16 20:41:20 +090012445/*[clinic input]
12446str.rstrip as unicode_rstrip
12447
12448 chars: object = NULL
12449 /
12450
12451Return a copy of the string with trailing whitespace removed.
12452
12453If chars is given and not None, remove characters in chars instead.
12454[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012455
12456static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012457unicode_rstrip_impl(PyObject *self, PyObject *chars)
12458/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459{
INADA Naoki3ae20562017-01-16 20:41:20 +090012460 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461}
12462
12463
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012465unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012467 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469
Serhiy Storchaka05997252013-01-26 12:14:02 +020012470 if (len < 1)
12471 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472
Victor Stinnerc4b49542011-12-11 22:44:26 +010012473 /* no repeat, return original string */
12474 if (len == 1)
12475 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012476
Benjamin Petersonbac79492012-01-14 13:34:47 -050012477 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 return NULL;
12479
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012480 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012481 PyErr_SetString(PyExc_OverflowError,
12482 "repeated string is too long");
12483 return NULL;
12484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012486
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012487 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488 if (!u)
12489 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012490 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 if (PyUnicode_GET_LENGTH(str) == 1) {
12493 const int kind = PyUnicode_KIND(str);
12494 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012495 if (kind == PyUnicode_1BYTE_KIND) {
12496 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012497 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012498 }
12499 else if (kind == PyUnicode_2BYTE_KIND) {
12500 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012501 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012502 ucs2[n] = fill_char;
12503 } else {
12504 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12505 assert(kind == PyUnicode_4BYTE_KIND);
12506 for (n = 0; n < len; ++n)
12507 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 }
12510 else {
12511 /* number of characters copied this far */
12512 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012513 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012515 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012517 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012519 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012520 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522 }
12523
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012524 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012525 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526}
12527
Alexander Belopolsky40018472011-02-26 01:02:56 +000012528PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012529PyUnicode_Replace(PyObject *str,
12530 PyObject *substr,
12531 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012532 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012534 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12535 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012537 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538}
12539
INADA Naoki3ae20562017-01-16 20:41:20 +090012540/*[clinic input]
12541str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
INADA Naoki3ae20562017-01-16 20:41:20 +090012543 old: unicode
12544 new: unicode
12545 count: Py_ssize_t = -1
12546 Maximum number of occurrences to replace.
12547 -1 (the default value) means replace all occurrences.
12548 /
12549
12550Return a copy with all occurrences of substring old replaced by new.
12551
12552If the optional argument count is given, only the first count occurrences are
12553replaced.
12554[clinic start generated code]*/
12555
12556static PyObject *
12557unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12558 Py_ssize_t count)
12559/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012561 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012563 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564}
12565
Alexander Belopolsky40018472011-02-26 01:02:56 +000012566static PyObject *
12567unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012569 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 Py_ssize_t isize;
12571 Py_ssize_t osize, squote, dquote, i, o;
12572 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012573 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012577 return NULL;
12578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 isize = PyUnicode_GET_LENGTH(unicode);
12580 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 /* Compute length of output, quote characters, and
12583 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012584 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 max = 127;
12586 squote = dquote = 0;
12587 ikind = PyUnicode_KIND(unicode);
12588 for (i = 0; i < isize; i++) {
12589 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012590 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012592 case '\'': squote++; break;
12593 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012595 incr = 2;
12596 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 default:
12598 /* Fast-path ASCII */
12599 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012600 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012602 ;
12603 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012606 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012608 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012610 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012612 if (osize > PY_SSIZE_T_MAX - incr) {
12613 PyErr_SetString(PyExc_OverflowError,
12614 "string is too long to generate repr");
12615 return NULL;
12616 }
12617 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 }
12619
12620 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012621 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012623 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 if (dquote)
12625 /* Both squote and dquote present. Use squote,
12626 and escape them */
12627 osize += squote;
12628 else
12629 quote = '"';
12630 }
Victor Stinner55c08782013-04-14 18:45:39 +020012631 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632
12633 repr = PyUnicode_New(osize, max);
12634 if (repr == NULL)
12635 return NULL;
12636 okind = PyUnicode_KIND(repr);
12637 odata = PyUnicode_DATA(repr);
12638
12639 PyUnicode_WRITE(okind, odata, 0, quote);
12640 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012641 if (unchanged) {
12642 _PyUnicode_FastCopyCharacters(repr, 1,
12643 unicode, 0,
12644 isize);
12645 }
12646 else {
12647 for (i = 0, o = 1; i < isize; i++) {
12648 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649
Victor Stinner55c08782013-04-14 18:45:39 +020012650 /* Escape quotes and backslashes */
12651 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012652 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012654 continue;
12655 }
12656
12657 /* Map special whitespace to '\t', \n', '\r' */
12658 if (ch == '\t') {
12659 PyUnicode_WRITE(okind, odata, o++, '\\');
12660 PyUnicode_WRITE(okind, odata, o++, 't');
12661 }
12662 else if (ch == '\n') {
12663 PyUnicode_WRITE(okind, odata, o++, '\\');
12664 PyUnicode_WRITE(okind, odata, o++, 'n');
12665 }
12666 else if (ch == '\r') {
12667 PyUnicode_WRITE(okind, odata, o++, '\\');
12668 PyUnicode_WRITE(okind, odata, o++, 'r');
12669 }
12670
12671 /* Map non-printable US ASCII to '\xhh' */
12672 else if (ch < ' ' || ch == 0x7F) {
12673 PyUnicode_WRITE(okind, odata, o++, '\\');
12674 PyUnicode_WRITE(okind, odata, o++, 'x');
12675 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12676 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12677 }
12678
12679 /* Copy ASCII characters as-is */
12680 else if (ch < 0x7F) {
12681 PyUnicode_WRITE(okind, odata, o++, ch);
12682 }
12683
12684 /* Non-ASCII characters */
12685 else {
12686 /* Map Unicode whitespace and control characters
12687 (categories Z* and C* except ASCII space)
12688 */
12689 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12690 PyUnicode_WRITE(okind, odata, o++, '\\');
12691 /* Map 8-bit characters to '\xhh' */
12692 if (ch <= 0xff) {
12693 PyUnicode_WRITE(okind, odata, o++, 'x');
12694 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12695 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12696 }
12697 /* Map 16-bit characters to '\uxxxx' */
12698 else if (ch <= 0xffff) {
12699 PyUnicode_WRITE(okind, odata, o++, 'u');
12700 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12701 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12702 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12703 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12704 }
12705 /* Map 21-bit characters to '\U00xxxxxx' */
12706 else {
12707 PyUnicode_WRITE(okind, odata, o++, 'U');
12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12709 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12713 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12714 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12715 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12716 }
12717 }
12718 /* Copy characters as-is */
12719 else {
12720 PyUnicode_WRITE(okind, odata, o++, ch);
12721 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012722 }
12723 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012726 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012727 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728}
12729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012730PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732\n\
12733Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012734such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735arguments start and end are interpreted as in slice notation.\n\
12736\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012737Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738
12739static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012742 /* initialize variables to prevent gcc warning */
12743 PyObject *substring = NULL;
12744 Py_ssize_t start = 0;
12745 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012746 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012748 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012751 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012754 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 if (result == -2)
12757 return NULL;
12758
Christian Heimes217cfd12007-12-02 14:31:20 +000012759 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760}
12761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012762PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012765Return the highest index in S where substring sub is found,\n\
12766such that sub is contained within S[start:end]. Optional\n\
12767arguments start and end are interpreted as in slice notation.\n\
12768\n\
12769Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770
12771static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012774 /* initialize variables to prevent gcc warning */
12775 PyObject *substring = NULL;
12776 Py_ssize_t start = 0;
12777 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012778 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012780 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012783 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012786 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 if (result == -2)
12789 return NULL;
12790
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791 if (result < 0) {
12792 PyErr_SetString(PyExc_ValueError, "substring not found");
12793 return NULL;
12794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795
Christian Heimes217cfd12007-12-02 14:31:20 +000012796 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797}
12798
INADA Naoki3ae20562017-01-16 20:41:20 +090012799/*[clinic input]
12800str.rjust as unicode_rjust
12801
12802 width: Py_ssize_t
12803 fillchar: Py_UCS4 = ' '
12804 /
12805
12806Return a right-justified string of length width.
12807
12808Padding is done using the specified fill character (default is a space).
12809[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
12811static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012812unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12813/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012815 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816 return NULL;
12817
Victor Stinnerc4b49542011-12-11 22:44:26 +010012818 if (PyUnicode_GET_LENGTH(self) >= width)
12819 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820
Victor Stinnerc4b49542011-12-11 22:44:26 +010012821 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822}
12823
Alexander Belopolsky40018472011-02-26 01:02:56 +000012824PyObject *
12825PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012827 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012830 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831}
12832
INADA Naoki3ae20562017-01-16 20:41:20 +090012833/*[clinic input]
12834str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835
INADA Naoki3ae20562017-01-16 20:41:20 +090012836 sep: object = None
        The delimiter according to which to split the string.
12838 None (the default value) means split according to any whitespace,
12839 and discard empty strings from the result.
12840 maxsplit: Py_ssize_t = -1
12841 Maximum number of splits to do.
12842 -1 (the default value) means no limit.
12843
12844Return a list of the words in the string, using sep as the delimiter string.
12845[clinic start generated code]*/
12846
12847static PyObject *
12848unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12849/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850{
INADA Naoki3ae20562017-01-16 20:41:20 +090012851 if (sep == Py_None)
12852 return split(self, NULL, maxsplit);
12853 if (PyUnicode_Check(sep))
12854 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012855
Victor Stinner886483e2018-09-07 18:00:58 +020012856 PyErr_Format(PyExc_TypeError, "must be str or None, not %T", sep);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858}
12859
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012861PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012862{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012863 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012864 int kind1, kind2;
12865 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012868 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012870
Victor Stinner14f8f022011-10-05 20:58:25 +020012871 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 len1 = PyUnicode_GET_LENGTH(str_obj);
12874 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012875 if (kind1 < kind2 || len1 < len2) {
12876 _Py_INCREF_UNICODE_EMPTY();
12877 if (!unicode_empty)
12878 out = NULL;
12879 else {
12880 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12881 Py_DECREF(unicode_empty);
12882 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012883 return out;
12884 }
12885 buf1 = PyUnicode_DATA(str_obj);
12886 buf2 = PyUnicode_DATA(sep_obj);
12887 if (kind2 != kind1) {
12888 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12889 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012890 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012893 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012895 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12896 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12897 else
12898 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 break;
12900 case PyUnicode_2BYTE_KIND:
12901 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12902 break;
12903 case PyUnicode_4BYTE_KIND:
12904 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12905 break;
12906 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012907 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012910 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912
12913 return out;
12914}
12915
12916
12917PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012918PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012919{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012921 int kind1, kind2;
12922 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012925 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012928 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 len1 = PyUnicode_GET_LENGTH(str_obj);
12931 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 if (kind1 < kind2 || len1 < len2) {
12933 _Py_INCREF_UNICODE_EMPTY();
12934 if (!unicode_empty)
12935 out = NULL;
12936 else {
12937 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12938 Py_DECREF(unicode_empty);
12939 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012940 return out;
12941 }
12942 buf1 = PyUnicode_DATA(str_obj);
12943 buf2 = PyUnicode_DATA(sep_obj);
12944 if (kind2 != kind1) {
12945 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12946 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012947 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012950 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012952 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12953 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12954 else
12955 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 break;
12957 case PyUnicode_2BYTE_KIND:
12958 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959 break;
12960 case PyUnicode_4BYTE_KIND:
12961 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12962 break;
12963 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012964 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012967 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969
12970 return out;
12971}
12972
INADA Naoki3ae20562017-01-16 20:41:20 +090012973/*[clinic input]
12974str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012975
INADA Naoki3ae20562017-01-16 20:41:20 +090012976 sep: object
12977 /
12978
12979Partition the string into three parts using the given separator.
12980
12981This will search for the separator in the string. If the separator is found,
12982returns a 3-tuple containing the part before the separator, the separator
12983itself, and the part after it.
12984
12985If the separator is not found, returns a 3-tuple containing the original string
12986and two empty strings.
12987[clinic start generated code]*/
12988
12989static PyObject *
12990unicode_partition(PyObject *self, PyObject *sep)
12991/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992{
INADA Naoki3ae20562017-01-16 20:41:20 +090012993 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994}
12995
INADA Naoki3ae20562017-01-16 20:41:20 +090012996/*[clinic input]
12997str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998
INADA Naoki3ae20562017-01-16 20:41:20 +090012999Partition the string into three parts using the given separator.
13000
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013001This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013002the separator is found, returns a 3-tuple containing the part before the
13003separator, the separator itself, and the part after it.
13004
13005If the separator is not found, returns a 3-tuple containing two empty strings
13006and the original string.
13007[clinic start generated code]*/
13008
13009static PyObject *
13010unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013011/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013012{
INADA Naoki3ae20562017-01-16 20:41:20 +090013013 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013014}
13015
Alexander Belopolsky40018472011-02-26 01:02:56 +000013016PyObject *
13017PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013018{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013019 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013020 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013021
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013022 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013023}
13024
INADA Naoki3ae20562017-01-16 20:41:20 +090013025/*[clinic input]
13026str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013027
INADA Naoki3ae20562017-01-16 20:41:20 +090013028Return a list of the words in the string, using sep as the delimiter string.
13029
13030Splits are done starting at the end of the string and working to the front.
13031[clinic start generated code]*/
13032
13033static PyObject *
13034unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13035/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013036{
INADA Naoki3ae20562017-01-16 20:41:20 +090013037 if (sep == Py_None)
13038 return rsplit(self, NULL, maxsplit);
13039 if (PyUnicode_Check(sep))
13040 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013041
Victor Stinner886483e2018-09-07 18:00:58 +020013042 PyErr_Format(PyExc_TypeError, "must be str or None, not %T", sep);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013043 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013044}
13045
INADA Naoki3ae20562017-01-16 20:41:20 +090013046/*[clinic input]
13047str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013049 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013050
13051Return a list of the lines in the string, breaking at line boundaries.
13052
13053Line breaks are not included in the resulting list unless keepends is given and
13054true.
13055[clinic start generated code]*/
13056
13057static PyObject *
13058unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013059/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013061 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062}
13063
13064static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013065PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013067 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068}
13069
INADA Naoki3ae20562017-01-16 20:41:20 +090013070/*[clinic input]
13071str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072
INADA Naoki3ae20562017-01-16 20:41:20 +090013073Convert uppercase characters to lowercase and lowercase characters to uppercase.
13074[clinic start generated code]*/
13075
13076static PyObject *
13077unicode_swapcase_impl(PyObject *self)
13078/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013080 if (PyUnicode_READY(self) == -1)
13081 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013082 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083}
13084
Larry Hastings61272b72014-01-07 12:41:53 -080013085/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013086
Larry Hastings31826802013-10-19 00:09:25 -070013087@staticmethod
13088str.maketrans as unicode_maketrans
13089
13090 x: object
13091
13092 y: unicode=NULL
13093
13094 z: unicode=NULL
13095
13096 /
13097
13098Return a translation table usable for str.translate().
13099
13100If there is only one argument, it must be a dictionary mapping Unicode
13101ordinals (integers) or characters to Unicode ordinals, strings or None.
13102Character keys will be then converted to ordinals.
13103If there are two arguments, they must be strings of equal length, and
13104in the resulting dictionary, each character in x will be mapped to the
13105character at the same position in y. If there is a third argument, it
13106must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013107[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013108
Larry Hastings31826802013-10-19 00:09:25 -070013109static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013110unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013111/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013112{
Georg Brandlceee0772007-11-27 23:48:05 +000013113 PyObject *new = NULL, *key, *value;
13114 Py_ssize_t i = 0;
13115 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013116
Georg Brandlceee0772007-11-27 23:48:05 +000013117 new = PyDict_New();
13118 if (!new)
13119 return NULL;
13120 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 int x_kind, y_kind, z_kind;
13122 void *x_data, *y_data, *z_data;
13123
Georg Brandlceee0772007-11-27 23:48:05 +000013124 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013125 if (!PyUnicode_Check(x)) {
13126 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13127 "be a string if there is a second argument");
13128 goto err;
13129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013131 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13132 "arguments must have equal length");
13133 goto err;
13134 }
13135 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 x_kind = PyUnicode_KIND(x);
13137 y_kind = PyUnicode_KIND(y);
13138 x_data = PyUnicode_DATA(x);
13139 y_data = PyUnicode_DATA(y);
13140 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13141 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013142 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013143 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013144 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013145 if (!value) {
13146 Py_DECREF(key);
13147 goto err;
13148 }
Georg Brandlceee0772007-11-27 23:48:05 +000013149 res = PyDict_SetItem(new, key, value);
13150 Py_DECREF(key);
13151 Py_DECREF(value);
13152 if (res < 0)
13153 goto err;
13154 }
13155 /* create entries for deleting chars in z */
13156 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013157 z_kind = PyUnicode_KIND(z);
13158 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013159 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013161 if (!key)
13162 goto err;
13163 res = PyDict_SetItem(new, key, Py_None);
13164 Py_DECREF(key);
13165 if (res < 0)
13166 goto err;
13167 }
13168 }
13169 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170 int kind;
13171 void *data;
13172
Georg Brandlceee0772007-11-27 23:48:05 +000013173 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013174 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013175 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13176 "to maketrans it must be a dict");
13177 goto err;
13178 }
13179 /* copy entries into the new dict, converting string keys to int keys */
13180 while (PyDict_Next(x, &i, &key, &value)) {
13181 if (PyUnicode_Check(key)) {
13182 /* convert string keys to integer keys */
13183 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013184 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013185 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13186 "table must be of length 1");
13187 goto err;
13188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013189 kind = PyUnicode_KIND(key);
13190 data = PyUnicode_DATA(key);
13191 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013192 if (!newkey)
13193 goto err;
13194 res = PyDict_SetItem(new, newkey, value);
13195 Py_DECREF(newkey);
13196 if (res < 0)
13197 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013198 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013199 /* just keep integer keys */
13200 if (PyDict_SetItem(new, key, value) < 0)
13201 goto err;
13202 } else {
13203 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13204 "be strings or integers");
13205 goto err;
13206 }
13207 }
13208 }
13209 return new;
13210 err:
13211 Py_DECREF(new);
13212 return NULL;
13213}
13214
INADA Naoki3ae20562017-01-16 20:41:20 +090013215/*[clinic input]
13216str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217
INADA Naoki3ae20562017-01-16 20:41:20 +090013218 table: object
13219 Translation table, which must be a mapping of Unicode ordinals to
13220 Unicode ordinals, strings, or None.
13221 /
13222
13223Replace each character in the string using the given translation table.
13224
13225The table must implement lookup/indexing via __getitem__, for instance a
13226dictionary or list. If this operation raises LookupError, the character is
13227left untouched. Characters mapped to None are deleted.
13228[clinic start generated code]*/
13229
13230static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013232/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235}
13236
INADA Naoki3ae20562017-01-16 20:41:20 +090013237/*[clinic input]
13238str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239
INADA Naoki3ae20562017-01-16 20:41:20 +090013240Return a copy of the string converted to uppercase.
13241[clinic start generated code]*/
13242
13243static PyObject *
13244unicode_upper_impl(PyObject *self)
13245/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013247 if (PyUnicode_READY(self) == -1)
13248 return NULL;
13249 if (PyUnicode_IS_ASCII(self))
13250 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013251 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252}
13253
INADA Naoki3ae20562017-01-16 20:41:20 +090013254/*[clinic input]
13255str.zfill as unicode_zfill
13256
13257 width: Py_ssize_t
13258 /
13259
13260Pad a numeric string with zeros on the left, to fill a field of the given width.
13261
13262The string is never truncated.
13263[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264
13265static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013266unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013267/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013269 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013270 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271 int kind;
13272 void *data;
13273 Py_UCS4 chr;
13274
Benjamin Petersonbac79492012-01-14 13:34:47 -050013275 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277
Victor Stinnerc4b49542011-12-11 22:44:26 +010013278 if (PyUnicode_GET_LENGTH(self) >= width)
13279 return unicode_result_unchanged(self);
13280
13281 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282
13283 u = pad(self, fill, 0, '0');
13284
Walter Dörwald068325e2002-04-15 13:36:47 +000013285 if (u == NULL)
13286 return NULL;
13287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 kind = PyUnicode_KIND(u);
13289 data = PyUnicode_DATA(u);
13290 chr = PyUnicode_READ(kind, data, fill);
13291
13292 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 PyUnicode_WRITE(kind, data, 0, chr);
13295 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296 }
13297
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013298 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013299 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301
#if 0
/* Compiled out.  Wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013310PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013313Return True if S starts with the specified prefix, False otherwise.\n\
13314With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013315With optional end, stop comparing S at that position.\n\
13316prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317
13318static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013319unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013320 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013321{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013322 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013323 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013324 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013325 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013326 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
Jesus Ceaac451502011-04-20 17:09:23 +020013328 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013329 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013330 if (PyTuple_Check(subobj)) {
13331 Py_ssize_t i;
13332 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013333 substring = PyTuple_GET_ITEM(subobj, i);
13334 if (!PyUnicode_Check(substring)) {
13335 PyErr_Format(PyExc_TypeError,
13336 "tuple for startswith must only contain str, "
Victor Stinner886483e2018-09-07 18:00:58 +020013337 "not %T",
13338 substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013339 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013340 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013341 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013342 if (result == -1)
13343 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013344 if (result) {
13345 Py_RETURN_TRUE;
13346 }
13347 }
13348 /* nothing matched */
13349 Py_RETURN_FALSE;
13350 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013351 if (!PyUnicode_Check(subobj)) {
13352 PyErr_Format(PyExc_TypeError,
13353 "startswith first arg must be str or "
Victor Stinner886483e2018-09-07 18:00:58 +020013354 "a tuple of str, not %T", subobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013355 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013356 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013357 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013358 if (result == -1)
13359 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013360 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361}
13362
13363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013364PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013365 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013367Return True if S ends with the specified suffix, False otherwise.\n\
13368With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013369With optional end, stop comparing S at that position.\n\
13370suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371
13372static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013373unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013376 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013377 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013378 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013379 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013380 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381
Jesus Ceaac451502011-04-20 17:09:23 +020013382 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 if (PyTuple_Check(subobj)) {
13385 Py_ssize_t i;
13386 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013387 substring = PyTuple_GET_ITEM(subobj, i);
13388 if (!PyUnicode_Check(substring)) {
13389 PyErr_Format(PyExc_TypeError,
13390 "tuple for endswith must only contain str, "
Victor Stinner886483e2018-09-07 18:00:58 +020013391 "not %T",
13392 substring);
Benjamin Peterson29060642009-01-31 22:14:21 +000013393 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013394 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013395 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013396 if (result == -1)
13397 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013398 if (result) {
13399 Py_RETURN_TRUE;
13400 }
13401 }
13402 Py_RETURN_FALSE;
13403 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013404 if (!PyUnicode_Check(subobj)) {
13405 PyErr_Format(PyExc_TypeError,
13406 "endswith first arg must be str or "
Victor Stinner886483e2018-09-07 18:00:58 +020013407 "a tuple of str, not %T", subobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013409 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013410 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013411 if (result == -1)
13412 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013413 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414}
13415
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013416static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013417_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013418{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013419 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13420 writer->data = PyUnicode_DATA(writer->buffer);
13421
13422 if (!writer->readonly) {
13423 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013424 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013425 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013426 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013427 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13428 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13429 writer->kind = PyUnicode_WCHAR_KIND;
13430 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13431
Victor Stinner8f674cc2013-04-17 23:02:17 +020013432 /* Copy-on-write mode: set buffer size to 0 so
13433 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13434 * next write. */
13435 writer->size = 0;
13436 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013437}
13438
Victor Stinnerd3f08822012-05-29 12:57:52 +020013439void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013440_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013441{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013442 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013443
13444 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013445 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013446
13447 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13448 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13449 writer->kind = PyUnicode_WCHAR_KIND;
13450 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013451}
13452
Victor Stinnerd3f08822012-05-29 12:57:52 +020013453int
13454_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13455 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013456{
13457 Py_ssize_t newlen;
13458 PyObject *newbuffer;
13459
Victor Stinner2740e462016-09-06 16:58:36 -070013460 assert(maxchar <= MAX_UNICODE);
13461
Victor Stinnerca9381e2015-09-22 00:58:32 +020013462 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013463 assert((maxchar > writer->maxchar && length >= 0)
13464 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013465
Victor Stinner202fdca2012-05-07 12:47:02 +020013466 if (length > PY_SSIZE_T_MAX - writer->pos) {
13467 PyErr_NoMemory();
13468 return -1;
13469 }
13470 newlen = writer->pos + length;
13471
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013472 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013473
Victor Stinnerd3f08822012-05-29 12:57:52 +020013474 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013475 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013476 if (writer->overallocate
13477 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13478 /* overallocate to limit the number of realloc() */
13479 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013480 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013481 if (newlen < writer->min_length)
13482 newlen = writer->min_length;
13483
Victor Stinnerd3f08822012-05-29 12:57:52 +020013484 writer->buffer = PyUnicode_New(newlen, maxchar);
13485 if (writer->buffer == NULL)
13486 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013488 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013489 if (writer->overallocate
13490 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13491 /* overallocate to limit the number of realloc() */
13492 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013493 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013494 if (newlen < writer->min_length)
13495 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013496
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013497 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013498 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013499 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013500 newbuffer = PyUnicode_New(newlen, maxchar);
13501 if (newbuffer == NULL)
13502 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13504 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013505 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013506 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013507 }
13508 else {
13509 newbuffer = resize_compact(writer->buffer, newlen);
13510 if (newbuffer == NULL)
13511 return -1;
13512 }
13513 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013514 }
13515 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013516 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517 newbuffer = PyUnicode_New(writer->size, maxchar);
13518 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013519 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013520 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13521 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013522 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013523 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013524 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013525 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013526
13527#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013528}
13529
Victor Stinnerca9381e2015-09-22 00:58:32 +020013530int
13531_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13532 enum PyUnicode_Kind kind)
13533{
13534 Py_UCS4 maxchar;
13535
13536 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13537 assert(writer->kind < kind);
13538
13539 switch (kind)
13540 {
13541 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13542 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13543 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13544 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013545 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013546 }
13547
13548 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13549}
13550
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013551static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013552_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013553{
Victor Stinner2740e462016-09-06 16:58:36 -070013554 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013555 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13556 return -1;
13557 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13558 writer->pos++;
13559 return 0;
13560}
13561
13562int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013563_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13564{
13565 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13566}
13567
13568int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13570{
13571 Py_UCS4 maxchar;
13572 Py_ssize_t len;
13573
13574 if (PyUnicode_READY(str) == -1)
13575 return -1;
13576 len = PyUnicode_GET_LENGTH(str);
13577 if (len == 0)
13578 return 0;
13579 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13580 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013581 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013582 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013583 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013584 Py_INCREF(str);
13585 writer->buffer = str;
13586 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013587 writer->pos += len;
13588 return 0;
13589 }
13590 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13591 return -1;
13592 }
13593 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13594 str, 0, len);
13595 writer->pos += len;
13596 return 0;
13597}
13598
Victor Stinnere215d962012-10-06 23:03:36 +020013599int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013600_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13601 Py_ssize_t start, Py_ssize_t end)
13602{
13603 Py_UCS4 maxchar;
13604 Py_ssize_t len;
13605
13606 if (PyUnicode_READY(str) == -1)
13607 return -1;
13608
13609 assert(0 <= start);
13610 assert(end <= PyUnicode_GET_LENGTH(str));
13611 assert(start <= end);
13612
13613 if (end == 0)
13614 return 0;
13615
13616 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13617 return _PyUnicodeWriter_WriteStr(writer, str);
13618
13619 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13620 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13621 else
13622 maxchar = writer->maxchar;
13623 len = end - start;
13624
13625 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13626 return -1;
13627
13628 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13629 str, start, len);
13630 writer->pos += len;
13631 return 0;
13632}
13633
13634int
Victor Stinner4a587072013-11-19 12:54:53 +010013635_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13636 const char *ascii, Py_ssize_t len)
13637{
13638 if (len == -1)
13639 len = strlen(ascii);
13640
13641 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13642
13643 if (writer->buffer == NULL && !writer->overallocate) {
13644 PyObject *str;
13645
13646 str = _PyUnicode_FromASCII(ascii, len);
13647 if (str == NULL)
13648 return -1;
13649
13650 writer->readonly = 1;
13651 writer->buffer = str;
13652 _PyUnicodeWriter_Update(writer);
13653 writer->pos += len;
13654 return 0;
13655 }
13656
13657 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13658 return -1;
13659
13660 switch (writer->kind)
13661 {
13662 case PyUnicode_1BYTE_KIND:
13663 {
13664 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13665 Py_UCS1 *data = writer->data;
13666
Christian Heimesf051e432016-09-13 20:22:02 +020013667 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013668 break;
13669 }
13670 case PyUnicode_2BYTE_KIND:
13671 {
13672 _PyUnicode_CONVERT_BYTES(
13673 Py_UCS1, Py_UCS2,
13674 ascii, ascii + len,
13675 (Py_UCS2 *)writer->data + writer->pos);
13676 break;
13677 }
13678 case PyUnicode_4BYTE_KIND:
13679 {
13680 _PyUnicode_CONVERT_BYTES(
13681 Py_UCS1, Py_UCS4,
13682 ascii, ascii + len,
13683 (Py_UCS4 *)writer->data + writer->pos);
13684 break;
13685 }
13686 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013687 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013688 }
13689
13690 writer->pos += len;
13691 return 0;
13692}
13693
13694int
13695_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13696 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013697{
13698 Py_UCS4 maxchar;
13699
13700 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13701 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13702 return -1;
13703 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13704 writer->pos += len;
13705 return 0;
13706}
13707
Victor Stinnerd3f08822012-05-29 12:57:52 +020013708PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013709_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013710{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013711 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013712
Victor Stinnerd3f08822012-05-29 12:57:52 +020013713 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013714 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013715 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013716 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013717
13718 str = writer->buffer;
13719 writer->buffer = NULL;
13720
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013721 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013722 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13723 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013724 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013725
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013726 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13727 PyObject *str2;
13728 str2 = resize_compact(str, writer->pos);
13729 if (str2 == NULL) {
13730 Py_DECREF(str);
13731 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013732 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013733 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013734 }
13735
Victor Stinner15a0bd32013-07-08 22:29:55 +020013736 assert(_PyUnicode_CheckConsistency(str, 1));
13737 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013738}
13739
Victor Stinnerd3f08822012-05-29 12:57:52 +020013740void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013741_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013742{
13743 Py_CLEAR(writer->buffer);
13744}
13745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013746#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013747
13748PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013749 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013750\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013751Return a formatted version of S, using substitutions from args and kwargs.\n\
13752The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013753
Eric Smith27bbca62010-11-04 17:06:58 +000013754PyDoc_STRVAR(format_map__doc__,
13755 "S.format_map(mapping) -> str\n\
13756\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013757Return a formatted version of S, using substitutions from mapping.\n\
13758The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013759
INADA Naoki3ae20562017-01-16 20:41:20 +090013760/*[clinic input]
13761str.__format__ as unicode___format__
13762
13763 format_spec: unicode
13764 /
13765
13766Return a formatted version of the string as described by format_spec.
13767[clinic start generated code]*/
13768
Eric Smith4a7d76d2008-05-30 18:10:19 +000013769static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013770unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013771/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013772{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013773 _PyUnicodeWriter writer;
13774 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013775
Victor Stinnerd3f08822012-05-29 12:57:52 +020013776 if (PyUnicode_READY(self) == -1)
13777 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013778 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013779 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13780 self, format_spec, 0,
13781 PyUnicode_GET_LENGTH(format_spec));
13782 if (ret == -1) {
13783 _PyUnicodeWriter_Dealloc(&writer);
13784 return NULL;
13785 }
13786 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013787}
13788
INADA Naoki3ae20562017-01-16 20:41:20 +090013789/*[clinic input]
13790str.__sizeof__ as unicode_sizeof
13791
13792Return the size of the string in memory, in bytes.
13793[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013794
13795static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013796unicode_sizeof_impl(PyObject *self)
13797/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 Py_ssize_t size;
13800
13801 /* If it's a compact object, account for base structure +
13802 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013803 if (PyUnicode_IS_COMPACT_ASCII(self))
13804 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13805 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013806 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013807 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808 else {
13809 /* If it is a two-block object, account for base object, and
13810 for character block if present. */
13811 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013812 if (_PyUnicode_DATA_ANY(self))
13813 size += (PyUnicode_GET_LENGTH(self) + 1) *
13814 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013815 }
13816 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013817 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013818 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13819 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13820 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13821 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013822
13823 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013824}
13825
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013826static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013827unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013828{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013829 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013830 if (!copy)
13831 return NULL;
13832 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013833}
13834
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this table
   (presumably generated by Argument Clinic -- TODO confirm); the remaining
   entries are written out by hand.  The table ends with a {NULL, NULL}
   sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() is the only entry here that accepts keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13891
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013892static PyObject *
13893unicode_mod(PyObject *v, PyObject *w)
13894{
Brian Curtindfc80e32011-08-10 20:28:54 -050013895 if (!PyUnicode_Check(v))
13896 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013897 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013898}
13899
13900static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013901 0, /*nb_add*/
13902 0, /*nb_subtract*/
13903 0, /*nb_multiply*/
13904 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013905};
13906
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013908 (lenfunc) unicode_length, /* sq_length */
13909 PyUnicode_Concat, /* sq_concat */
13910 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13911 (ssizeargfunc) unicode_getitem, /* sq_item */
13912 0, /* sq_slice */
13913 0, /* sq_ass_item */
13914 0, /* sq_ass_slice */
13915 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013916};
13917
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013918static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013919unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013921 if (PyUnicode_READY(self) == -1)
13922 return NULL;
13923
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013924 if (PyIndex_Check(item)) {
13925 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013926 if (i == -1 && PyErr_Occurred())
13927 return NULL;
13928 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013929 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013930 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013931 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013932 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013933 PyObject *result;
13934 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013935 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013936 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013937
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013938 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013939 return NULL;
13940 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013941 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13942 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013943
13944 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013945 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013946 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013947 slicelength == PyUnicode_GET_LENGTH(self)) {
13948 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013949 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013950 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013951 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013952 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013953 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013954 src_kind = PyUnicode_KIND(self);
13955 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013956 if (!PyUnicode_IS_ASCII(self)) {
13957 kind_limit = kind_maxchar_limit(src_kind);
13958 max_char = 0;
13959 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13960 ch = PyUnicode_READ(src_kind, src_data, cur);
13961 if (ch > max_char) {
13962 max_char = ch;
13963 if (max_char >= kind_limit)
13964 break;
13965 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013966 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013967 }
Victor Stinner55c99112011-10-13 01:17:06 +020013968 else
13969 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013970 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013971 if (result == NULL)
13972 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013973 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013974 dest_data = PyUnicode_DATA(result);
13975
13976 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013977 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13978 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013979 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013980 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013981 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013982 } else {
13983 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13984 return NULL;
13985 }
13986}
13987
13988static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 (lenfunc)unicode_length, /* mp_length */
13990 (binaryfunc)unicode_subscript, /* mp_subscript */
13991 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013992};
13993
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995/* Helpers for PyUnicode_Format() */
13996
/* State shared by the PyUnicode_Format() helpers while one format operation
   is being processed. */
struct unicode_formatter_t {
    /* Argument source: a tuple, or a single object when arglen < 0
       (see unicode_format_getnextarg()). */
    PyObject *args;
    int args_owned;     /* presumably: non-zero if args must be released -- TODO confirm */
    /* arglen: number of available arguments (negative for a single non-tuple
       object); argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    PyObject *dict;     /* presumably the mapping for %(name)s lookups -- TODO confirm */

    /* Format string and scanning state.  NOTE(review): the exact meaning of
       fmtcnt/fmtpos is not visible in this part of the file -- confirm. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Output accumulator. */
    _PyUnicodeWriter writer;
};
14010
/* Parsed description of a single conversion specifier. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character, e.g. 'd', 'x' or a float code */
    int flags;          /* bitmask of F_* flags (F_ALT, F_SIGN, F_BLANK, ...) */
    Py_ssize_t width;   /* field width; -1 is tested for by the fast paths */
    int prec;           /* precision; negative values make formatfloat() use 6 */
    int sign;           /* NOTE(review): set by callers outside this view -- confirm meaning */
};
14018
Guido van Rossumd57fd912000-03-10 22:53:23 +000014019static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014020unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014021{
Victor Stinnera47082312012-10-04 02:19:54 +020014022 Py_ssize_t argidx = ctx->argidx;
14023
14024 if (argidx < ctx->arglen) {
14025 ctx->argidx++;
14026 if (ctx->arglen < 0)
14027 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014028 else
Victor Stinnera47082312012-10-04 02:19:54 +020014029 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014030 }
14031 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014032 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014033 return NULL;
14034}
14035
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014036/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014037
Victor Stinnera47082312012-10-04 02:19:54 +020014038/* Format a float into the writer if the writer is not NULL, or into *p_output
14039 otherwise.
14040
14041 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014042static int
Victor Stinnera47082312012-10-04 02:19:54 +020014043formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14044 PyObject **p_output,
14045 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014046{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014047 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014048 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014049 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014050 int prec;
14051 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014052
Guido van Rossumd57fd912000-03-10 22:53:23 +000014053 x = PyFloat_AsDouble(v);
14054 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014055 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014056
Victor Stinnera47082312012-10-04 02:19:54 +020014057 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014058 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014059 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014060
Victor Stinnera47082312012-10-04 02:19:54 +020014061 if (arg->flags & F_ALT)
14062 dtoa_flags = Py_DTSF_ALT;
14063 else
14064 dtoa_flags = 0;
14065 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014066 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014067 return -1;
14068 len = strlen(p);
14069 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014070 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014071 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014072 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014073 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014074 }
14075 else
14076 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014077 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014078 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014079}
14080
Victor Stinnerd0880d52012-04-27 23:40:13 +020014081/* formatlong() emulates the format codes d, u, o, x and X, and
14082 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14083 * Python's regular ints.
14084 * Return value: a new PyUnicodeObject*, or NULL if error.
14085 * The output string is of the form
14086 * "-"? ("0x" | "0X")? digit+
14087 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14088 * set in flags. The case of hex digits will be correct,
14089 * There will be at least prec digits, zero-filled on the left if
14090 * necessary to get that many.
14091 * val object to be converted
14092 * flags bitmask of format flags; only F_ALT is looked at
14093 * prec minimum number of digits; 0-fill on left if needed
14094 * type a character in [duoxX]; u acts the same as d
14095 *
14096 * CAUTION: o, x and X conversions on regular ints can never
14097 * produce a '-' sign, but can for Python's unbounded ints.
14098 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014099PyObject *
14100_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014101{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014102 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014104 Py_ssize_t i;
14105 int sign; /* 1 if '-', else 0 */
14106 int len; /* number of characters */
14107 Py_ssize_t llen;
14108 int numdigits; /* len == numnondigits + numdigits */
14109 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014110
Victor Stinnerd0880d52012-04-27 23:40:13 +020014111 /* Avoid exceeding SSIZE_T_MAX */
14112 if (prec > INT_MAX-3) {
14113 PyErr_SetString(PyExc_OverflowError,
14114 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014115 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014116 }
14117
14118 assert(PyLong_Check(val));
14119
14120 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014121 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014122 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014123 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014124 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014125 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014126 /* int and int subclasses should print numerically when a numeric */
14127 /* format code is used (see issue18780) */
14128 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014129 break;
14130 case 'o':
14131 numnondigits = 2;
14132 result = PyNumber_ToBase(val, 8);
14133 break;
14134 case 'x':
14135 case 'X':
14136 numnondigits = 2;
14137 result = PyNumber_ToBase(val, 16);
14138 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014139 }
14140 if (!result)
14141 return NULL;
14142
14143 assert(unicode_modifiable(result));
14144 assert(PyUnicode_IS_READY(result));
14145 assert(PyUnicode_IS_ASCII(result));
14146
14147 /* To modify the string in-place, there can only be one reference. */
14148 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014149 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014150 PyErr_BadInternalCall();
14151 return NULL;
14152 }
14153 buf = PyUnicode_DATA(result);
14154 llen = PyUnicode_GET_LENGTH(result);
14155 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014156 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014157 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014158 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014159 return NULL;
14160 }
14161 len = (int)llen;
14162 sign = buf[0] == '-';
14163 numnondigits += sign;
14164 numdigits = len - numnondigits;
14165 assert(numdigits > 0);
14166
14167 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014168 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014169 (type == 'o' || type == 'x' || type == 'X'))) {
14170 assert(buf[sign] == '0');
14171 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14172 buf[sign+1] == 'o');
14173 numnondigits -= 2;
14174 buf += 2;
14175 len -= 2;
14176 if (sign)
14177 buf[0] = '-';
14178 assert(len == numnondigits + numdigits);
14179 assert(numdigits > 0);
14180 }
14181
14182 /* Fill with leading zeroes to meet minimum width. */
14183 if (prec > numdigits) {
14184 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14185 numnondigits + prec);
14186 char *b1;
14187 if (!r1) {
14188 Py_DECREF(result);
14189 return NULL;
14190 }
14191 b1 = PyBytes_AS_STRING(r1);
14192 for (i = 0; i < numnondigits; ++i)
14193 *b1++ = *buf++;
14194 for (i = 0; i < prec - numdigits; i++)
14195 *b1++ = '0';
14196 for (i = 0; i < numdigits; i++)
14197 *b1++ = *buf++;
14198 *b1 = '\0';
14199 Py_DECREF(result);
14200 result = r1;
14201 buf = PyBytes_AS_STRING(result);
14202 len = numnondigits + prec;
14203 }
14204
14205 /* Fix up case for hex conversions. */
14206 if (type == 'X') {
14207 /* Need to convert all lower case letters to upper case.
14208 and need to convert 0x to 0X (and -0x to -0X). */
14209 for (i = 0; i < len; i++)
14210 if (buf[i] >= 'a' && buf[i] <= 'x')
14211 buf[i] -= 'a'-'A';
14212 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014213 if (!PyUnicode_Check(result)
14214 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014215 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014216 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014217 Py_DECREF(result);
14218 result = unicode;
14219 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014220 else if (len != PyUnicode_GET_LENGTH(result)) {
14221 if (PyUnicode_Resize(&result, len) < 0)
14222 Py_CLEAR(result);
14223 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014224 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014225}
14226
Ethan Furmandf3ed242014-01-05 06:50:30 -080014227/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014228 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014229 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014230 * -1 and raise an exception on error */
14231static int
Victor Stinnera47082312012-10-04 02:19:54 +020014232mainformatlong(PyObject *v,
14233 struct unicode_format_arg_t *arg,
14234 PyObject **p_output,
14235 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014236{
14237 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014238 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014239
14240 if (!PyNumber_Check(v))
14241 goto wrongtype;
14242
Ethan Furman9ab74802014-03-21 06:38:46 -070014243 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014244 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014245 if (type == 'o' || type == 'x' || type == 'X') {
14246 iobj = PyNumber_Index(v);
14247 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014248 if (PyErr_ExceptionMatches(PyExc_TypeError))
14249 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014250 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014251 }
14252 }
14253 else {
14254 iobj = PyNumber_Long(v);
14255 if (iobj == NULL ) {
14256 if (PyErr_ExceptionMatches(PyExc_TypeError))
14257 goto wrongtype;
14258 return -1;
14259 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014260 }
14261 assert(PyLong_Check(iobj));
14262 }
14263 else {
14264 iobj = v;
14265 Py_INCREF(iobj);
14266 }
14267
14268 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014269 && arg->width == -1 && arg->prec == -1
14270 && !(arg->flags & (F_SIGN | F_BLANK))
14271 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014272 {
14273 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014274 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014275 int base;
14276
Victor Stinnera47082312012-10-04 02:19:54 +020014277 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014278 {
14279 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014280 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 case 'd':
14282 case 'i':
14283 case 'u':
14284 base = 10;
14285 break;
14286 case 'o':
14287 base = 8;
14288 break;
14289 case 'x':
14290 case 'X':
14291 base = 16;
14292 break;
14293 }
14294
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014295 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14296 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014297 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014298 }
14299 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014300 return 1;
14301 }
14302
Ethan Furmanb95b5612015-01-23 20:05:18 -080014303 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304 Py_DECREF(iobj);
14305 if (res == NULL)
14306 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014307 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014308 return 0;
14309
14310wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014311 switch(type)
14312 {
14313 case 'o':
14314 case 'x':
14315 case 'X':
14316 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +020014317 "%%%c format: an integer is required, not %T",
14318 type, v);
Ethan Furman9ab74802014-03-21 06:38:46 -070014319 break;
14320 default:
14321 PyErr_Format(PyExc_TypeError,
Victor Stinner886483e2018-09-07 18:00:58 +020014322 "%%%c format: a number is required, not %T",
14323 type, v);
Ethan Furman9ab74802014-03-21 06:38:46 -070014324 break;
14325 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014326 return -1;
14327}
14328
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014329static Py_UCS4
14330formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014331{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014332 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014333 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014334 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014335 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014336 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014337 goto onError;
14338 }
14339 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014340 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014341 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014342 /* make sure number is a type of integer */
14343 if (!PyLong_Check(v)) {
14344 iobj = PyNumber_Index(v);
14345 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014346 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014347 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014348 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014349 Py_DECREF(iobj);
14350 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014351 else {
14352 x = PyLong_AsLong(v);
14353 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014354 if (x == -1 && PyErr_Occurred())
14355 goto onError;
14356
Victor Stinner8faf8212011-12-08 22:14:11 +010014357 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014358 PyErr_SetString(PyExc_OverflowError,
14359 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014360 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014361 }
14362
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014363 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014365
Benjamin Peterson29060642009-01-31 22:14:21 +000014366 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014367 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014368 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014369 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014370}
14371
Victor Stinnera47082312012-10-04 02:19:54 +020014372/* Parse options of an argument: flags, width, precision.
14373 Handle also "%(name)" syntax.
14374
14375 Return 0 if the argument has been formatted into arg->str.
14376 Return 1 if the argument has been written into ctx->writer,
14377 Raise an exception and return -1 on error. */
14378static int
14379unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14380 struct unicode_format_arg_t *arg)
14381{
14382#define FORMAT_READ(ctx) \
14383 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14384
14385 PyObject *v;
14386
Victor Stinnera47082312012-10-04 02:19:54 +020014387 if (arg->ch == '(') {
14388 /* Get argument value from a dictionary. Example: "%(name)s". */
14389 Py_ssize_t keystart;
14390 Py_ssize_t keylen;
14391 PyObject *key;
14392 int pcount = 1;
14393
14394 if (ctx->dict == NULL) {
14395 PyErr_SetString(PyExc_TypeError,
14396 "format requires a mapping");
14397 return -1;
14398 }
14399 ++ctx->fmtpos;
14400 --ctx->fmtcnt;
14401 keystart = ctx->fmtpos;
14402 /* Skip over balanced parentheses */
14403 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14404 arg->ch = FORMAT_READ(ctx);
14405 if (arg->ch == ')')
14406 --pcount;
14407 else if (arg->ch == '(')
14408 ++pcount;
14409 ctx->fmtpos++;
14410 }
14411 keylen = ctx->fmtpos - keystart - 1;
14412 if (ctx->fmtcnt < 0 || pcount > 0) {
14413 PyErr_SetString(PyExc_ValueError,
14414 "incomplete format key");
14415 return -1;
14416 }
14417 key = PyUnicode_Substring(ctx->fmtstr,
14418 keystart, keystart + keylen);
14419 if (key == NULL)
14420 return -1;
14421 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014422 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014423 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014424 }
14425 ctx->args = PyObject_GetItem(ctx->dict, key);
14426 Py_DECREF(key);
14427 if (ctx->args == NULL)
14428 return -1;
14429 ctx->args_owned = 1;
14430 ctx->arglen = -1;
14431 ctx->argidx = -2;
14432 }
14433
14434 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014435 while (--ctx->fmtcnt >= 0) {
14436 arg->ch = FORMAT_READ(ctx);
14437 ctx->fmtpos++;
14438 switch (arg->ch) {
14439 case '-': arg->flags |= F_LJUST; continue;
14440 case '+': arg->flags |= F_SIGN; continue;
14441 case ' ': arg->flags |= F_BLANK; continue;
14442 case '#': arg->flags |= F_ALT; continue;
14443 case '0': arg->flags |= F_ZERO; continue;
14444 }
14445 break;
14446 }
14447
14448 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014449 if (arg->ch == '*') {
14450 v = unicode_format_getnextarg(ctx);
14451 if (v == NULL)
14452 return -1;
14453 if (!PyLong_Check(v)) {
14454 PyErr_SetString(PyExc_TypeError,
14455 "* wants int");
14456 return -1;
14457 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014458 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014459 if (arg->width == -1 && PyErr_Occurred())
14460 return -1;
14461 if (arg->width < 0) {
14462 arg->flags |= F_LJUST;
14463 arg->width = -arg->width;
14464 }
14465 if (--ctx->fmtcnt >= 0) {
14466 arg->ch = FORMAT_READ(ctx);
14467 ctx->fmtpos++;
14468 }
14469 }
14470 else if (arg->ch >= '0' && arg->ch <= '9') {
14471 arg->width = arg->ch - '0';
14472 while (--ctx->fmtcnt >= 0) {
14473 arg->ch = FORMAT_READ(ctx);
14474 ctx->fmtpos++;
14475 if (arg->ch < '0' || arg->ch > '9')
14476 break;
14477 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14478 mixing signed and unsigned comparison. Since arg->ch is between
14479 '0' and '9', casting to int is safe. */
14480 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14481 PyErr_SetString(PyExc_ValueError,
14482 "width too big");
14483 return -1;
14484 }
14485 arg->width = arg->width*10 + (arg->ch - '0');
14486 }
14487 }
14488
14489 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014490 if (arg->ch == '.') {
14491 arg->prec = 0;
14492 if (--ctx->fmtcnt >= 0) {
14493 arg->ch = FORMAT_READ(ctx);
14494 ctx->fmtpos++;
14495 }
14496 if (arg->ch == '*') {
14497 v = unicode_format_getnextarg(ctx);
14498 if (v == NULL)
14499 return -1;
14500 if (!PyLong_Check(v)) {
14501 PyErr_SetString(PyExc_TypeError,
14502 "* wants int");
14503 return -1;
14504 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014505 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014506 if (arg->prec == -1 && PyErr_Occurred())
14507 return -1;
14508 if (arg->prec < 0)
14509 arg->prec = 0;
14510 if (--ctx->fmtcnt >= 0) {
14511 arg->ch = FORMAT_READ(ctx);
14512 ctx->fmtpos++;
14513 }
14514 }
14515 else if (arg->ch >= '0' && arg->ch <= '9') {
14516 arg->prec = arg->ch - '0';
14517 while (--ctx->fmtcnt >= 0) {
14518 arg->ch = FORMAT_READ(ctx);
14519 ctx->fmtpos++;
14520 if (arg->ch < '0' || arg->ch > '9')
14521 break;
14522 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14523 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014524 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014525 return -1;
14526 }
14527 arg->prec = arg->prec*10 + (arg->ch - '0');
14528 }
14529 }
14530 }
14531
14532 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14533 if (ctx->fmtcnt >= 0) {
14534 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14535 if (--ctx->fmtcnt >= 0) {
14536 arg->ch = FORMAT_READ(ctx);
14537 ctx->fmtpos++;
14538 }
14539 }
14540 }
14541 if (ctx->fmtcnt < 0) {
14542 PyErr_SetString(PyExc_ValueError,
14543 "incomplete format");
14544 return -1;
14545 }
14546 return 0;
14547
14548#undef FORMAT_READ
14549}
14550
14551/* Format one argument. Supported conversion specifiers:
14552
14553 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014554 - "i", "d", "u": int or float
14555 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014556 - "e", "E", "f", "F", "g", "G": float
14557 - "c": int or str (1 character)
14558
Victor Stinner8dbd4212012-12-04 09:30:24 +010014559 When possible, the output is written directly into the Unicode writer
14560 (ctx->writer). A string is created when padding is required.
14561
Victor Stinnera47082312012-10-04 02:19:54 +020014562 Return 0 if the argument has been formatted into *p_str,
14563 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014564 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014565static int
14566unicode_format_arg_format(struct unicode_formatter_t *ctx,
14567 struct unicode_format_arg_t *arg,
14568 PyObject **p_str)
14569{
14570 PyObject *v;
14571 _PyUnicodeWriter *writer = &ctx->writer;
14572
14573 if (ctx->fmtcnt == 0)
14574 ctx->writer.overallocate = 0;
14575
Victor Stinnera47082312012-10-04 02:19:54 +020014576 v = unicode_format_getnextarg(ctx);
14577 if (v == NULL)
14578 return -1;
14579
Victor Stinnera47082312012-10-04 02:19:54 +020014580
14581 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014582 case 's':
14583 case 'r':
14584 case 'a':
14585 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14586 /* Fast path */
14587 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14588 return -1;
14589 return 1;
14590 }
14591
14592 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14593 *p_str = v;
14594 Py_INCREF(*p_str);
14595 }
14596 else {
14597 if (arg->ch == 's')
14598 *p_str = PyObject_Str(v);
14599 else if (arg->ch == 'r')
14600 *p_str = PyObject_Repr(v);
14601 else
14602 *p_str = PyObject_ASCII(v);
14603 }
14604 break;
14605
14606 case 'i':
14607 case 'd':
14608 case 'u':
14609 case 'o':
14610 case 'x':
14611 case 'X':
14612 {
14613 int ret = mainformatlong(v, arg, p_str, writer);
14614 if (ret != 0)
14615 return ret;
14616 arg->sign = 1;
14617 break;
14618 }
14619
14620 case 'e':
14621 case 'E':
14622 case 'f':
14623 case 'F':
14624 case 'g':
14625 case 'G':
14626 if (arg->width == -1 && arg->prec == -1
14627 && !(arg->flags & (F_SIGN | F_BLANK)))
14628 {
14629 /* Fast path */
14630 if (formatfloat(v, arg, NULL, writer) == -1)
14631 return -1;
14632 return 1;
14633 }
14634
14635 arg->sign = 1;
14636 if (formatfloat(v, arg, p_str, NULL) == -1)
14637 return -1;
14638 break;
14639
14640 case 'c':
14641 {
14642 Py_UCS4 ch = formatchar(v);
14643 if (ch == (Py_UCS4) -1)
14644 return -1;
14645 if (arg->width == -1 && arg->prec == -1) {
14646 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014647 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014648 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014649 return 1;
14650 }
14651 *p_str = PyUnicode_FromOrdinal(ch);
14652 break;
14653 }
14654
14655 default:
14656 PyErr_Format(PyExc_ValueError,
14657 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014658 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014659 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14660 (int)arg->ch,
14661 ctx->fmtpos - 1);
14662 return -1;
14663 }
14664 if (*p_str == NULL)
14665 return -1;
14666 assert (PyUnicode_Check(*p_str));
14667 return 0;
14668}
14669
14670static int
14671unicode_format_arg_output(struct unicode_formatter_t *ctx,
14672 struct unicode_format_arg_t *arg,
14673 PyObject *str)
14674{
14675 Py_ssize_t len;
14676 enum PyUnicode_Kind kind;
14677 void *pbuf;
14678 Py_ssize_t pindex;
14679 Py_UCS4 signchar;
14680 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014681 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014682 Py_ssize_t sublen;
14683 _PyUnicodeWriter *writer = &ctx->writer;
14684 Py_UCS4 fill;
14685
14686 fill = ' ';
14687 if (arg->sign && arg->flags & F_ZERO)
14688 fill = '0';
14689
14690 if (PyUnicode_READY(str) == -1)
14691 return -1;
14692
14693 len = PyUnicode_GET_LENGTH(str);
14694 if ((arg->width == -1 || arg->width <= len)
14695 && (arg->prec == -1 || arg->prec >= len)
14696 && !(arg->flags & (F_SIGN | F_BLANK)))
14697 {
14698 /* Fast path */
14699 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14700 return -1;
14701 return 0;
14702 }
14703
14704 /* Truncate the string for "s", "r" and "a" formats
14705 if the precision is set */
14706 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14707 if (arg->prec >= 0 && len > arg->prec)
14708 len = arg->prec;
14709 }
14710
14711 /* Adjust sign and width */
14712 kind = PyUnicode_KIND(str);
14713 pbuf = PyUnicode_DATA(str);
14714 pindex = 0;
14715 signchar = '\0';
14716 if (arg->sign) {
14717 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14718 if (ch == '-' || ch == '+') {
14719 signchar = ch;
14720 len--;
14721 pindex++;
14722 }
14723 else if (arg->flags & F_SIGN)
14724 signchar = '+';
14725 else if (arg->flags & F_BLANK)
14726 signchar = ' ';
14727 else
14728 arg->sign = 0;
14729 }
14730 if (arg->width < len)
14731 arg->width = len;
14732
14733 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014734 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014735 if (!(arg->flags & F_LJUST)) {
14736 if (arg->sign) {
14737 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014738 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014739 }
14740 else {
14741 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014742 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014743 }
14744 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014745 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14746 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014747 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014748 }
14749
Victor Stinnera47082312012-10-04 02:19:54 +020014750 buflen = arg->width;
14751 if (arg->sign && len == arg->width)
14752 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014753 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014754 return -1;
14755
14756 /* Write the sign if needed */
14757 if (arg->sign) {
14758 if (fill != ' ') {
14759 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14760 writer->pos += 1;
14761 }
14762 if (arg->width > len)
14763 arg->width--;
14764 }
14765
14766 /* Write the numeric prefix for "x", "X" and "o" formats
14767 if the alternate form is used.
14768 For example, write "0x" for the "%#x" format. */
14769 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14770 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14771 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14772 if (fill != ' ') {
14773 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14774 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14775 writer->pos += 2;
14776 pindex += 2;
14777 }
14778 arg->width -= 2;
14779 if (arg->width < 0)
14780 arg->width = 0;
14781 len -= 2;
14782 }
14783
14784 /* Pad left with the fill character if needed */
14785 if (arg->width > len && !(arg->flags & F_LJUST)) {
14786 sublen = arg->width - len;
14787 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14788 writer->pos += sublen;
14789 arg->width = len;
14790 }
14791
14792 /* If padding with spaces: write sign if needed and/or numeric prefix if
14793 the alternate form is used */
14794 if (fill == ' ') {
14795 if (arg->sign) {
14796 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14797 writer->pos += 1;
14798 }
14799 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14800 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14801 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14802 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14803 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14804 writer->pos += 2;
14805 pindex += 2;
14806 }
14807 }
14808
14809 /* Write characters */
14810 if (len) {
14811 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14812 str, pindex, len);
14813 writer->pos += len;
14814 }
14815
14816 /* Pad right with the fill character if needed */
14817 if (arg->width > len) {
14818 sublen = arg->width - len;
14819 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14820 writer->pos += sublen;
14821 }
14822 return 0;
14823}
14824
14825/* Helper of PyUnicode_Format(): format one arg.
14826 Return 0 on success, raise an exception and return -1 on error. */
14827static int
14828unicode_format_arg(struct unicode_formatter_t *ctx)
14829{
14830 struct unicode_format_arg_t arg;
14831 PyObject *str;
14832 int ret;
14833
Victor Stinner8dbd4212012-12-04 09:30:24 +010014834 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014835 if (arg.ch == '%') {
14836 ctx->fmtpos++;
14837 ctx->fmtcnt--;
14838 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14839 return -1;
14840 return 0;
14841 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014842 arg.flags = 0;
14843 arg.width = -1;
14844 arg.prec = -1;
14845 arg.sign = 0;
14846 str = NULL;
14847
Victor Stinnera47082312012-10-04 02:19:54 +020014848 ret = unicode_format_arg_parse(ctx, &arg);
14849 if (ret == -1)
14850 return -1;
14851
14852 ret = unicode_format_arg_format(ctx, &arg, &str);
14853 if (ret == -1)
14854 return -1;
14855
14856 if (ret != 1) {
14857 ret = unicode_format_arg_output(ctx, &arg, str);
14858 Py_DECREF(str);
14859 if (ret == -1)
14860 return -1;
14861 }
14862
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014863 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014864 PyErr_SetString(PyExc_TypeError,
14865 "not all arguments converted during string formatting");
14866 return -1;
14867 }
14868 return 0;
14869}
14870
Alexander Belopolsky40018472011-02-26 01:02:56 +000014871PyObject *
14872PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014873{
Victor Stinnera47082312012-10-04 02:19:54 +020014874 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014875
Guido van Rossumd57fd912000-03-10 22:53:23 +000014876 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014877 PyErr_BadInternalCall();
14878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014879 }
Victor Stinnera47082312012-10-04 02:19:54 +020014880
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014881 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014882 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014883
14884 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014885 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14886 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14887 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14888 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014889
Victor Stinner8f674cc2013-04-17 23:02:17 +020014890 _PyUnicodeWriter_Init(&ctx.writer);
14891 ctx.writer.min_length = ctx.fmtcnt + 100;
14892 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014893
Guido van Rossumd57fd912000-03-10 22:53:23 +000014894 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014895 ctx.arglen = PyTuple_Size(args);
14896 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014897 }
14898 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014899 ctx.arglen = -1;
14900 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014901 }
Victor Stinnera47082312012-10-04 02:19:54 +020014902 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014903 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014904 ctx.dict = args;
14905 else
14906 ctx.dict = NULL;
14907 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014908
Victor Stinnera47082312012-10-04 02:19:54 +020014909 while (--ctx.fmtcnt >= 0) {
14910 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014911 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014912
14913 nonfmtpos = ctx.fmtpos++;
14914 while (ctx.fmtcnt >= 0 &&
14915 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14916 ctx.fmtpos++;
14917 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014918 }
Victor Stinnera47082312012-10-04 02:19:54 +020014919 if (ctx.fmtcnt < 0) {
14920 ctx.fmtpos--;
14921 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014922 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014923
Victor Stinnercfc4c132013-04-03 01:48:39 +020014924 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14925 nonfmtpos, ctx.fmtpos) < 0)
14926 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014927 }
14928 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014929 ctx.fmtpos++;
14930 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014931 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014932 }
14933 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014934
Victor Stinnera47082312012-10-04 02:19:54 +020014935 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014936 PyErr_SetString(PyExc_TypeError,
14937 "not all arguments converted during string formatting");
14938 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014939 }
14940
Victor Stinnera47082312012-10-04 02:19:54 +020014941 if (ctx.args_owned) {
14942 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943 }
Victor Stinnera47082312012-10-04 02:19:54 +020014944 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014945
Benjamin Peterson29060642009-01-31 22:14:21 +000014946 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014947 _PyUnicodeWriter_Dealloc(&ctx.writer);
14948 if (ctx.args_owned) {
14949 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014950 }
14951 return NULL;
14952}
14953
Jeremy Hylton938ace62002-07-17 16:30:39 +000014954static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014955unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14956
Tim Peters6d6c1a32001-08-02 04:15:00 +000014957static PyObject *
14958unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14959{
Benjamin Peterson29060642009-01-31 22:14:21 +000014960 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014961 static char *kwlist[] = {"object", "encoding", "errors", 0};
14962 char *encoding = NULL;
14963 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014964
Benjamin Peterson14339b62009-01-31 16:36:08 +000014965 if (type != &PyUnicode_Type)
14966 return unicode_subtype_new(type, args, kwds);
14967 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014968 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 return NULL;
14970 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014971 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 if (encoding == NULL && errors == NULL)
14973 return PyObject_Str(x);
14974 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014975 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014976}
14977
Guido van Rossume023fe02001-08-30 03:12:59 +000014978static PyObject *
14979unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14980{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014981 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014982 Py_ssize_t length, char_size;
14983 int share_wstr, share_utf8;
14984 unsigned int kind;
14985 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014986
Benjamin Peterson14339b62009-01-31 16:36:08 +000014987 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014988
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014989 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014990 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014992 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014993 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014994 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014995 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014996 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014997
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014998 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014999 if (self == NULL) {
15000 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015001 return NULL;
15002 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015003 kind = PyUnicode_KIND(unicode);
15004 length = PyUnicode_GET_LENGTH(unicode);
15005
15006 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015007#ifdef Py_DEBUG
15008 _PyUnicode_HASH(self) = -1;
15009#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015010 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015011#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015012 _PyUnicode_STATE(self).interned = 0;
15013 _PyUnicode_STATE(self).kind = kind;
15014 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015015 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015016 _PyUnicode_STATE(self).ready = 1;
15017 _PyUnicode_WSTR(self) = NULL;
15018 _PyUnicode_UTF8_LENGTH(self) = 0;
15019 _PyUnicode_UTF8(self) = NULL;
15020 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015021 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015022
15023 share_utf8 = 0;
15024 share_wstr = 0;
15025 if (kind == PyUnicode_1BYTE_KIND) {
15026 char_size = 1;
15027 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15028 share_utf8 = 1;
15029 }
15030 else if (kind == PyUnicode_2BYTE_KIND) {
15031 char_size = 2;
15032 if (sizeof(wchar_t) == 2)
15033 share_wstr = 1;
15034 }
15035 else {
15036 assert(kind == PyUnicode_4BYTE_KIND);
15037 char_size = 4;
15038 if (sizeof(wchar_t) == 4)
15039 share_wstr = 1;
15040 }
15041
15042 /* Ensure we won't overflow the length. */
15043 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15044 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015045 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015047 data = PyObject_MALLOC((length + 1) * char_size);
15048 if (data == NULL) {
15049 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015050 goto onError;
15051 }
15052
Victor Stinnerc3c74152011-10-02 20:39:55 +020015053 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015054 if (share_utf8) {
15055 _PyUnicode_UTF8_LENGTH(self) = length;
15056 _PyUnicode_UTF8(self) = data;
15057 }
15058 if (share_wstr) {
15059 _PyUnicode_WSTR_LENGTH(self) = length;
15060 _PyUnicode_WSTR(self) = (wchar_t *)data;
15061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015062
Christian Heimesf051e432016-09-13 20:22:02 +020015063 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015064 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015065 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015066#ifdef Py_DEBUG
15067 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15068#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015069 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015070 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015071
15072onError:
15073 Py_DECREF(unicode);
15074 Py_DECREF(self);
15075 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015076}
15077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015078PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015079"str(object='') -> str\n\
15080str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015081\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015082Create a new string object from the given object. If encoding or\n\
15083errors is specified, then the object must expose a data buffer\n\
15084that will be decoded using the given encoding and error handler.\n\
15085Otherwise, returns the result of object.__str__() (if defined)\n\
15086or repr(object).\n\
15087encoding defaults to sys.getdefaultencoding().\n\
15088errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015089
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015090static PyObject *unicode_iter(PyObject *seq);
15091
Guido van Rossumd57fd912000-03-10 22:53:23 +000015092PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015093 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015094 "str", /* tp_name */
15095 sizeof(PyUnicodeObject), /* tp_basicsize */
15096 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015097 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015098 (destructor)unicode_dealloc, /* tp_dealloc */
15099 0, /* tp_print */
15100 0, /* tp_getattr */
15101 0, /* tp_setattr */
15102 0, /* tp_reserved */
15103 unicode_repr, /* tp_repr */
15104 &unicode_as_number, /* tp_as_number */
15105 &unicode_as_sequence, /* tp_as_sequence */
15106 &unicode_as_mapping, /* tp_as_mapping */
15107 (hashfunc) unicode_hash, /* tp_hash*/
15108 0, /* tp_call*/
15109 (reprfunc) unicode_str, /* tp_str */
15110 PyObject_GenericGetAttr, /* tp_getattro */
15111 0, /* tp_setattro */
15112 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015114 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15115 unicode_doc, /* tp_doc */
15116 0, /* tp_traverse */
15117 0, /* tp_clear */
15118 PyUnicode_RichCompare, /* tp_richcompare */
15119 0, /* tp_weaklistoffset */
15120 unicode_iter, /* tp_iter */
15121 0, /* tp_iternext */
15122 unicode_methods, /* tp_methods */
15123 0, /* tp_members */
15124 0, /* tp_getset */
15125 &PyBaseObject_Type, /* tp_base */
15126 0, /* tp_dict */
15127 0, /* tp_descr_get */
15128 0, /* tp_descr_set */
15129 0, /* tp_dictoffset */
15130 0, /* tp_init */
15131 0, /* tp_alloc */
15132 unicode_new, /* tp_new */
15133 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015134};
15135
15136/* Initialize the Unicode implementation */
15137
Victor Stinner3a50e702011-10-18 21:21:00 +020015138int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015139{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015140 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015141 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015142 0x000A, /* LINE FEED */
15143 0x000D, /* CARRIAGE RETURN */
15144 0x001C, /* FILE SEPARATOR */
15145 0x001D, /* GROUP SEPARATOR */
15146 0x001E, /* RECORD SEPARATOR */
15147 0x0085, /* NEXT LINE */
15148 0x2028, /* LINE SEPARATOR */
15149 0x2029, /* PARAGRAPH SEPARATOR */
15150 };
15151
Fred Drakee4315f52000-05-09 19:53:39 +000015152 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015153 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015154 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015155 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015156 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015157
Guido van Rossumcacfc072002-05-24 19:01:59 +000015158 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015159 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015160
15161 /* initialize the linebreak bloom filter */
15162 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015163 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015164 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015165
Christian Heimes26532f72013-07-20 14:57:16 +020015166 if (PyType_Ready(&EncodingMapType) < 0)
15167 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015168
Benjamin Petersonc4311282012-10-30 23:21:10 -040015169 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15170 Py_FatalError("Can't initialize field name iterator type");
15171
15172 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15173 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015174
Victor Stinner3a50e702011-10-18 21:21:00 +020015175 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015176}
15177
15178/* Finalize the Unicode implementation */
15179
/* Nothing is released here: the function unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15185
Guido van Rossumd57fd912000-03-10 22:53:23 +000015186void
Thomas Wouters78890102000-07-22 19:25:51 +000015187_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015188{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015189 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015190
Serhiy Storchaka05997252013-01-26 12:14:02 +020015191 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015192
Serhiy Storchaka05997252013-01-26 12:14:02 +020015193 for (i = 0; i < 256; i++)
15194 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015195 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015196 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015197}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015198
Walter Dörwald16807132007-05-25 13:52:07 +000015199void
15200PyUnicode_InternInPlace(PyObject **p)
15201{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015202 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015203 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015204#ifdef Py_DEBUG
15205 assert(s != NULL);
15206 assert(_PyUnicode_CHECK(s));
15207#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015208 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015209 return;
15210#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015211 /* If it's a subclass, we don't really know what putting
15212 it in the interned dict might do. */
15213 if (!PyUnicode_CheckExact(s))
15214 return;
15215 if (PyUnicode_CHECK_INTERNED(s))
15216 return;
15217 if (interned == NULL) {
15218 interned = PyDict_New();
15219 if (interned == NULL) {
15220 PyErr_Clear(); /* Don't leave an exception */
15221 return;
15222 }
15223 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015224 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015225 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015226 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015227 if (t == NULL) {
15228 PyErr_Clear();
15229 return;
15230 }
15231 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015232 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015233 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015234 return;
15235 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015236 /* The two references in interned are not counted by refcnt.
15237 The deallocator will take care of this */
15238 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015239 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015240}
15241
15242void
15243PyUnicode_InternImmortal(PyObject **p)
15244{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 PyUnicode_InternInPlace(p);
15246 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015247 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 Py_INCREF(*p);
15249 }
Walter Dörwald16807132007-05-25 13:52:07 +000015250}
15251
15252PyObject *
15253PyUnicode_InternFromString(const char *cp)
15254{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015255 PyObject *s = PyUnicode_FromString(cp);
15256 if (s == NULL)
15257 return NULL;
15258 PyUnicode_InternInPlace(&s);
15259 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015260}
15261
Alexander Belopolsky40018472011-02-26 01:02:56 +000015262void
15263_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015264{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015265 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015266 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 Py_ssize_t i, n;
15268 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015269
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 if (interned == NULL || !PyDict_Check(interned))
15271 return;
15272 keys = PyDict_Keys(interned);
15273 if (keys == NULL || !PyList_Check(keys)) {
15274 PyErr_Clear();
15275 return;
15276 }
Walter Dörwald16807132007-05-25 13:52:07 +000015277
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15279 detector, interned unicode strings are not forcibly deallocated;
15280 rather, we give them their stolen references back, and then clear
15281 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015282
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 n = PyList_GET_SIZE(keys);
15284 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015285 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015287 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015288 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015289 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015291 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 case SSTATE_NOT_INTERNED:
15293 /* XXX Shouldn't happen */
15294 break;
15295 case SSTATE_INTERNED_IMMORTAL:
15296 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015297 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 break;
15299 case SSTATE_INTERNED_MORTAL:
15300 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015301 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 break;
15303 default:
15304 Py_FatalError("Inconsistent interned string state.");
15305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015306 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 }
15308 fprintf(stderr, "total size of all interned strings: "
15309 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15310 "mortal/immortal\n", mortal_size, immortal_size);
15311 Py_DECREF(keys);
15312 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015313 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015314}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015315
15316
15317/********************* Unicode Iterator **************************/
15318
15319typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015320 PyObject_HEAD
15321 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015322 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015323} unicodeiterobject;
15324
15325static void
15326unicodeiter_dealloc(unicodeiterobject *it)
15327{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 _PyObject_GC_UNTRACK(it);
15329 Py_XDECREF(it->it_seq);
15330 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015331}
15332
15333static int
15334unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 Py_VISIT(it->it_seq);
15337 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015338}
15339
15340static PyObject *
15341unicodeiter_next(unicodeiterobject *it)
15342{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015343 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015344
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 assert(it != NULL);
15346 seq = it->it_seq;
15347 if (seq == NULL)
15348 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015349 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015351 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15352 int kind = PyUnicode_KIND(seq);
15353 void *data = PyUnicode_DATA(seq);
15354 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15355 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 if (item != NULL)
15357 ++it->it_index;
15358 return item;
15359 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015360
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015362 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015363 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015364}
15365
15366static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015367unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015368{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 Py_ssize_t len = 0;
15370 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015371 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015372 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015373}
15374
15375PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15376
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015377static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015378unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015379{
15380 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015381 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015382 it->it_seq, it->it_index);
15383 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015384 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015385 if (u == NULL)
15386 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015387 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015388 }
15389}
15390
15391PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15392
15393static PyObject *
15394unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15395{
15396 Py_ssize_t index = PyLong_AsSsize_t(state);
15397 if (index == -1 && PyErr_Occurred())
15398 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015399 if (it->it_seq != NULL) {
15400 if (index < 0)
15401 index = 0;
15402 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15403 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15404 it->it_index = index;
15405 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015406 Py_RETURN_NONE;
15407}
15408
15409PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15410
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015411static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015412 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015413 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015414 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15415 reduce_doc},
15416 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15417 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015418 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015419};
15420
15421PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015422 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15423 "str_iterator", /* tp_name */
15424 sizeof(unicodeiterobject), /* tp_basicsize */
15425 0, /* tp_itemsize */
15426 /* methods */
15427 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15428 0, /* tp_print */
15429 0, /* tp_getattr */
15430 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015431 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 0, /* tp_repr */
15433 0, /* tp_as_number */
15434 0, /* tp_as_sequence */
15435 0, /* tp_as_mapping */
15436 0, /* tp_hash */
15437 0, /* tp_call */
15438 0, /* tp_str */
15439 PyObject_GenericGetAttr, /* tp_getattro */
15440 0, /* tp_setattro */
15441 0, /* tp_as_buffer */
15442 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15443 0, /* tp_doc */
15444 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15445 0, /* tp_clear */
15446 0, /* tp_richcompare */
15447 0, /* tp_weaklistoffset */
15448 PyObject_SelfIter, /* tp_iter */
15449 (iternextfunc)unicodeiter_next, /* tp_iternext */
15450 unicodeiter_methods, /* tp_methods */
15451 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015452};
15453
15454static PyObject *
15455unicode_iter(PyObject *seq)
15456{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015457 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015458
Benjamin Peterson14339b62009-01-31 16:36:08 +000015459 if (!PyUnicode_Check(seq)) {
15460 PyErr_BadInternalCall();
15461 return NULL;
15462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015463 if (PyUnicode_READY(seq) == -1)
15464 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015465 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15466 if (it == NULL)
15467 return NULL;
15468 it->it_index = 0;
15469 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015470 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015471 _PyObject_GC_TRACK(it);
15472 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015473}
15474
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015475
15476size_t
15477Py_UNICODE_strlen(const Py_UNICODE *u)
15478{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015479 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015480}
15481
15482Py_UNICODE*
15483Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15484{
15485 Py_UNICODE *u = s1;
15486 while ((*u++ = *s2++));
15487 return s1;
15488}
15489
15490Py_UNICODE*
15491Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15492{
15493 Py_UNICODE *u = s1;
15494 while ((*u++ = *s2++))
15495 if (n-- == 0)
15496 break;
15497 return s1;
15498}
15499
15500Py_UNICODE*
15501Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15502{
15503 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015504 u1 += wcslen(u1);
15505 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015506 return s1;
15507}
15508
15509int
15510Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15511{
15512 while (*s1 && *s2 && *s1 == *s2)
15513 s1++, s2++;
15514 if (*s1 && *s2)
15515 return (*s1 < *s2) ? -1 : +1;
15516 if (*s1)
15517 return 1;
15518 if (*s2)
15519 return -1;
15520 return 0;
15521}
15522
15523int
15524Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15525{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015526 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015527 for (; n != 0; n--) {
15528 u1 = *s1;
15529 u2 = *s2;
15530 if (u1 != u2)
15531 return (u1 < u2) ? -1 : +1;
15532 if (u1 == '\0')
15533 return 0;
15534 s1++;
15535 s2++;
15536 }
15537 return 0;
15538}
15539
15540Py_UNICODE*
15541Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15542{
15543 const Py_UNICODE *p;
15544 for (p = s; *p; p++)
15545 if (*p == c)
15546 return (Py_UNICODE*)p;
15547 return NULL;
15548}
15549
15550Py_UNICODE*
15551Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15552{
15553 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015554 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015555 while (p != s) {
15556 p--;
15557 if (*p == c)
15558 return (Py_UNICODE*)p;
15559 }
15560 return NULL;
15561}
Victor Stinner331ea922010-08-10 16:37:20 +000015562
Victor Stinner71133ff2010-09-01 23:43:53 +000015563Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015564PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015565{
Victor Stinner577db2c2011-10-11 22:12:48 +020015566 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015567 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015569 if (!PyUnicode_Check(unicode)) {
15570 PyErr_BadArgument();
15571 return NULL;
15572 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015573 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015574 if (u == NULL)
15575 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015576 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015577 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015578 PyErr_NoMemory();
15579 return NULL;
15580 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015581 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015582 size *= sizeof(Py_UNICODE);
15583 copy = PyMem_Malloc(size);
15584 if (copy == NULL) {
15585 PyErr_NoMemory();
15586 return NULL;
15587 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015588 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015589 return copy;
15590}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015591
Georg Brandl66c221e2010-10-14 07:04:07 +000015592/* A _string module, to export formatter_parser and formatter_field_name_split
15593 to the string.Formatter class implemented in Python. */
15594
15595static PyMethodDef _string_methods[] = {
15596 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15597 METH_O, PyDoc_STR("split the argument as a field name")},
15598 {"formatter_parser", (PyCFunction) formatter_parser,
15599 METH_O, PyDoc_STR("parse the argument as a format string")},
15600 {NULL, NULL}
15601};
15602
15603static struct PyModuleDef _string_module = {
15604 PyModuleDef_HEAD_INIT,
15605 "_string",
15606 PyDoc_STR("string helper module"),
15607 0,
15608 _string_methods,
15609 NULL,
15610 NULL,
15611 NULL,
15612 NULL
15613};
15614
15615PyMODINIT_FUNC
15616PyInit__string(void)
15617{
15618 return PyModule_Create(&_string_module);
15619}
15620
15621
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015622#ifdef __cplusplus
15623}
15624#endif