blob: bb1c0830fc5d8a6cea255669e8f4a4c750689781 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Larry Hastings61272b72014-01-07 12:41:53 -080052/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090053class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080054[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090055/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57/*[python input]
58class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68[python start generated code]*/
69/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000071/* --- Globals ------------------------------------------------------------
72
Serhiy Storchaka05997252013-01-26 12:14:02 +020073NOTE: In the interpreter's initialization phase, some globals are currently
74 initialized dynamically as needed. In the process Unicode objects may
75 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000076
77*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the assert()s in this file.  In debug builds
   (Py_DEBUG) it runs the full structural consistency check, without
   scanning the string content (second argument 0); in release builds it
   is only a type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors for the unicode object layouts.  The macros whose names
   start with an underscore and that contain no assert() are raw field
   accesses usable as lvalues; the others check their argument first. */

/* Raw access to the utf8 field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: for a compact ASCII string this
   is the memory located right after the PyASCIIObject header, otherwise
   it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: for a compact ASCII string this
   is the length field of the PyASCIIObject, otherwise utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer and its length (no checks). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length, state and hash fields (no checks). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the utf8 pointer of the object is the very same pointer as its
   character data, i.e. the UTF-8 form is not a separate memory block.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the very same pointer as its
   character data.  The macro must only refer to its own parameter "op":
   naming any other identifier would silently capture a variable of the
   caller (or fail to compile where no such variable exists). */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* Non-zero when the object carries a UTF-8 buffer of its own: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                 \
     _PyUnicode_UTF8(op) != NULL &&                     \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object carries a wstr buffer of its own: its wstr
   pointer is set and it is not the case that the string is ready with
   wstr pointing at the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL &&                     \
     !(PyUnicode_IS_READY(op) &&                        \
       _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* Copy a run of characters while changing the element type.
   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer of "to_type" elements.  Every source element is
   cast to to_type individually.  The bulk of the run is copied four
   elements per iteration, the remaining (at most three) one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_dst = (to_type *)(to);                        \
        const from_type *_src = (from_type *)(begin);           \
        const from_type *_stop = (from_type *)(end);            \
        Py_ssize_t _count = (_stop) - (_src);                   \
        const from_type *_bulk_stop =                           \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);              \
        for (; _src < (_bulk_stop); _src += 4, _dst += 4) {     \
            _dst[0] = (to_type) _src[0];                        \
            _dst[1] = (to_type) _src[1];                        \
            _dst[2] = (to_type) _src[2];                        \
            _dst[3] = (to_type) _src[3];                        \
        }                                                       \
        for (; _src < (_stop); ++_src, ++_dst)                  \
            *_dst = (to_type) *_src;                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
/* Platform-specific overallocation factor.
   NOTE(review): the percentages below suggest the factor is used as a
   divisor of the requested size (2 -> 50%, 4 -> 25%); the code using it is
   outside this part of the file -- confirm. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% gives the best results */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% gives the best results */
#  define OVERALLOCATE_FACTOR 2
#endif
188
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Put differently, the actual reference count of a string is:
   s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215#define _Py_RETURN_UNICODE_EMPTY() \
216 do { \
217 _Py_INCREF_UNICODE_EMPTY(); \
218 return unicode_empty; \
219 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200221/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700222static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
224
/* List of static strings.
   NOTE(review): presumably the head of a linked list of _Py_Identifier
   entries; the code that maintains it is outside this part of the file. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); a NULL
   entry means no shared object has been stored for that character yet. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
262
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200263/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200264static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100266static int unicode_modifiable(PyObject *unicode);
267
Victor Stinnerfe226c02011-10-03 03:52:20 +0200268
Alexander Belopolsky40018472011-02-26 01:02:56 +0000269static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100270_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200271static PyObject *
272_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
273static PyObject *
274_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
275
276static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000278 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100279 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
281
Alexander Belopolsky40018472011-02-26 01:02:56 +0000282static void
283raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300284 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100285 PyObject *unicode,
286 Py_ssize_t startpos, Py_ssize_t endpos,
287 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000288
/* Same kind of lookup table for line breaks.
   128-entry table indexed by ASCII code; an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for every other code. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
316
INADA Naoki3ae20562017-01-16 20:41:20 +0900317static int convert_uc(PyObject *obj, void *addr);
318
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300319#include "clinic/unicodeobject.c.h"
320
/* Codec error handlers recognised by name.  _Py_ERROR_UNKNOWN is 0;
   _Py_ERROR_OTHER stands for any name that is not in the list below. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Translate the name of an error handler into its enum value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other known names, and _Py_ERROR_OTHER for any other string.  The
   comparison is exact (strcmp). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* No handler given means the default one. */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
359
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300360/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
361 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000362Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000363PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000364{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000365#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000366 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000367#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000368 /* This is actually an illegal character, so it should
369 not be passed to unichr. */
370 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000371#endif
372}
373
#ifdef Py_DEBUG
/* Debug-build self check of the internal layout of a unicode object.

   op:            object to check; must be a unicode object (asserted).
   check_content: when non-zero (and the string is not in the legacy
                  PyUnicode_WCHAR_KIND form), additionally scan the
                  characters to verify that the narrowest possible kind is
                  used and that the buffer is null-terminated.

   Every invariant is an assert(), so a violation aborts the process.
   Returns 1 when all checks pass, which allows the call itself to be
   wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always one byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the
               PyCompactUnicodeObject header, and the UTF-8 form can never
               alias them */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 form */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind selected
               below by SIZEOF_WCHAR_T is in use, and must not otherwise */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing representation must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element right after the last character must be zero */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
489
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100490static PyObject*
491unicode_result_wchar(PyObject *unicode)
492{
493#ifndef Py_DEBUG
494 Py_ssize_t len;
495
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 len = _PyUnicode_WSTR_LENGTH(unicode);
497 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100498 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100500 }
501
502 if (len == 1) {
503 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100504 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100505 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
506 Py_DECREF(unicode);
507 return latin1_char;
508 }
509 }
510
511 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200512 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100513 return NULL;
514 }
515#else
Victor Stinneraa771272012-10-04 02:32:58 +0200516 assert(Py_REFCNT(unicode) == 1);
517
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 /* don't make the result ready in debug mode to ensure that the caller
519 makes the string ready before using it */
520 assert(_PyUnicode_CheckConsistency(unicode, 1));
521#endif
522 return unicode;
523}
524
525static PyObject*
526unicode_result_ready(PyObject *unicode)
527{
528 Py_ssize_t length;
529
530 length = PyUnicode_GET_LENGTH(unicode);
531 if (length == 0) {
532 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100533 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200534 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100535 }
536 return unicode_empty;
537 }
538
539 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200540 void *data = PyUnicode_DATA(unicode);
541 int kind = PyUnicode_KIND(unicode);
542 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100543 if (ch < 256) {
544 PyObject *latin1_char = unicode_latin1[ch];
545 if (latin1_char != NULL) {
546 if (unicode != latin1_char) {
547 Py_INCREF(latin1_char);
548 Py_DECREF(unicode);
549 }
550 return latin1_char;
551 }
552 else {
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554 Py_INCREF(unicode);
555 unicode_latin1[ch] = unicode;
556 return unicode;
557 }
558 }
559 }
560
561 assert(_PyUnicode_CheckConsistency(unicode, 1));
562 return unicode;
563}
564
565static PyObject*
566unicode_result(PyObject *unicode)
567{
568 assert(_PyUnicode_CHECK(unicode));
569 if (PyUnicode_IS_READY(unicode))
570 return unicode_result_ready(unicode);
571 else
572 return unicode_result_wchar(unicode);
573}
574
Victor Stinnerc4b49542011-12-11 22:44:26 +0100575static PyObject*
576unicode_result_unchanged(PyObject *unicode)
577{
578 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500579 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100580 return NULL;
581 Py_INCREF(unicode);
582 return unicode;
583 }
584 else
585 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100586 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100587}
588
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200589/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
590 ASCII, Latin1, UTF-8, etc. */
591static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200592backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
594{
Victor Stinnerad771582015-10-09 12:38:53 +0200595 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200596 Py_UCS4 ch;
597 enum PyUnicode_Kind kind;
598 void *data;
599
600 assert(PyUnicode_IS_READY(unicode));
601 kind = PyUnicode_KIND(unicode);
602 data = PyUnicode_DATA(unicode);
603
604 size = 0;
605 /* determine replacement size */
606 for (i = collstart; i < collend; ++i) {
607 Py_ssize_t incr;
608
609 ch = PyUnicode_READ(kind, data, i);
610 if (ch < 0x100)
611 incr = 2+2;
612 else if (ch < 0x10000)
613 incr = 2+4;
614 else {
615 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200616 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200617 }
618 if (size > PY_SSIZE_T_MAX - incr) {
619 PyErr_SetString(PyExc_OverflowError,
620 "encoded result is too long for a Python string");
621 return NULL;
622 }
623 size += incr;
624 }
625
Victor Stinnerad771582015-10-09 12:38:53 +0200626 str = _PyBytesWriter_Prepare(writer, str, size);
627 if (str == NULL)
628 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200629
630 /* generate replacement */
631 for (i = collstart; i < collend; ++i) {
632 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200633 *str++ = '\\';
634 if (ch >= 0x00010000) {
635 *str++ = 'U';
636 *str++ = Py_hexdigits[(ch>>28)&0xf];
637 *str++ = Py_hexdigits[(ch>>24)&0xf];
638 *str++ = Py_hexdigits[(ch>>20)&0xf];
639 *str++ = Py_hexdigits[(ch>>16)&0xf];
640 *str++ = Py_hexdigits[(ch>>12)&0xf];
641 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200642 }
Victor Stinner797485e2015-10-09 03:17:30 +0200643 else if (ch >= 0x100) {
644 *str++ = 'u';
645 *str++ = Py_hexdigits[(ch>>12)&0xf];
646 *str++ = Py_hexdigits[(ch>>8)&0xf];
647 }
648 else
649 *str++ = 'x';
650 *str++ = Py_hexdigits[(ch>>4)&0xf];
651 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200652 }
653 return str;
654}
655
656/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 10)
678 incr = 2+1+1;
679 else if (ch < 100)
680 incr = 2+2+1;
681 else if (ch < 1000)
682 incr = 2+3+1;
683 else if (ch < 10000)
684 incr = 2+4+1;
685 else if (ch < 100000)
686 incr = 2+5+1;
687 else if (ch < 1000000)
688 incr = 2+6+1;
689 else {
690 assert(ch <= MAX_UNICODE);
691 incr = 2+7+1;
692 }
693 if (size > PY_SSIZE_T_MAX - incr) {
694 PyErr_SetString(PyExc_OverflowError,
695 "encoded result is too long for a Python string");
696 return NULL;
697 }
698 size += incr;
699 }
700
Victor Stinnerad771582015-10-09 12:38:53 +0200701 str = _PyBytesWriter_Prepare(writer, str, size);
702 if (str == NULL)
703 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200704
705 /* generate replacement */
706 for (i = collstart; i < collend; ++i) {
707 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
708 }
709 return str;
710}
711
Thomas Wouters477c8d52006-05-27 19:21:47 +0000712/* --- Bloom Filters ----------------------------------------------------- */
713
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
717
718/* the linebreak mask is set up by Unicode_Init below */
719
/* Number of bits used in a bloom mask: 128, 64 or 32, whichever is the
   largest value that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() is non-zero for any character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of ch is set in mask.
   Both arguments are parenthesized so that arbitrary expressions (for
   example "a | b") can be passed as the mask. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup below 128, otherwise the bloom mask is
   checked first and Py_UNICODE_ISLINEBREAK() only on a hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000739
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700740static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000742{
Victor Stinnera85af502013-04-09 21:53:54 +0200743#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
744 do { \
745 TYPE *data = (TYPE *)PTR; \
746 TYPE *end = data + LEN; \
747 Py_UCS4 ch; \
748 for (; data != end; data++) { \
749 ch = *data; \
750 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
751 } \
752 break; \
753 } while (0)
754
Thomas Wouters477c8d52006-05-27 19:21:47 +0000755 /* calculate simple bloom-style bitmask for a given unicode string */
756
Antoine Pitrouf068f942010-01-13 14:19:12 +0000757 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000758
759 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200760 switch (kind) {
761 case PyUnicode_1BYTE_KIND:
762 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
763 break;
764 case PyUnicode_2BYTE_KIND:
765 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
766 break;
767 case PyUnicode_4BYTE_KIND:
768 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
769 break;
770 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700771 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200772 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000773 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200774
775#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000776}
777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300778static int
779ensure_unicode(PyObject *obj)
780{
781 if (!PyUnicode_Check(obj)) {
782 PyErr_Format(PyExc_TypeError,
783 "must be str, not %.100s",
784 Py_TYPE(obj)->tp_name);
785 return -1;
786 }
787 return PyUnicode_READY(obj);
788}
789
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200790/* Compilation of templated routines */
791
792#include "stringlib/asciilib.h"
793#include "stringlib/fastsearch.h"
794#include "stringlib/partition.h"
795#include "stringlib/split.h"
796#include "stringlib/count.h"
797#include "stringlib/find.h"
798#include "stringlib/find_max_char.h"
799#include "stringlib/localeutil.h"
800#include "stringlib/undef.h"
801
802#include "stringlib/ucs1lib.h"
803#include "stringlib/fastsearch.h"
804#include "stringlib/partition.h"
805#include "stringlib/split.h"
806#include "stringlib/count.h"
807#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300808#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200809#include "stringlib/find_max_char.h"
810#include "stringlib/localeutil.h"
811#include "stringlib/undef.h"
812
813#include "stringlib/ucs2lib.h"
814#include "stringlib/fastsearch.h"
815#include "stringlib/partition.h"
816#include "stringlib/split.h"
817#include "stringlib/count.h"
818#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300819#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200820#include "stringlib/find_max_char.h"
821#include "stringlib/localeutil.h"
822#include "stringlib/undef.h"
823
824#include "stringlib/ucs4lib.h"
825#include "stringlib/fastsearch.h"
826#include "stringlib/partition.h"
827#include "stringlib/split.h"
828#include "stringlib/count.h"
829#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300830#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200831#include "stringlib/find_max_char.h"
832#include "stringlib/localeutil.h"
833#include "stringlib/undef.h"
834
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200835#include "stringlib/unicodedefs.h"
836#include "stringlib/fastsearch.h"
837#include "stringlib/count.h"
838#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100839#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200840
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841/* --- Unicode Object ----------------------------------------------------- */
842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200843static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200844fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700846static inline Py_ssize_t
847findchar(const void *s, int kind,
848 Py_ssize_t size, Py_UCS4 ch,
849 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200851 switch (kind) {
852 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200853 if ((Py_UCS1) ch != ch)
854 return -1;
855 if (direction > 0)
856 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
857 else
858 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200859 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200860 if ((Py_UCS2) ch != ch)
861 return -1;
862 if (direction > 0)
863 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
864 else
865 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200866 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200867 if (direction > 0)
868 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
869 else
870 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200871 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700872 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
Victor Stinnerafffce42012-10-03 23:03:17 +0200876#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000877/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200878 earlier.
879
880 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
881 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
882 invalid character in Unicode 6.0. */
883static void
884unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
885{
886 int kind = PyUnicode_KIND(unicode);
887 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
888 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
889 if (length <= old_length)
890 return;
891 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
892}
893#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001073 Ux0000 terminated; some code (e.g. new_identifier)
1074 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Expand a 16-bit wchar_t buffer [begin, end) into the UCS4 buffer of
   'unicode', combining each valid high/low surrogate pair into a single code
   point.  Lone surrogates are copied through unchanged.  The other width
   conversions are done with macros for speed; only this one needs a function.

   'unicode' must already be a 4-byte kind string whose length equals the
   number of code points produced (room for the terminating null character
   is assumed to exist beyond that). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* Anything that is not a complete surrogate pair is copied as is. */
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src + 1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001522 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001523 }
1524 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001525 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001526 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001527 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001528 Py_ssize_t i;
1529
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 for (i=0; i < how_many; i++) {
1531 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001532 if (ch > to_maxchar)
1533 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001534 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1535 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001536 }
1537 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 return 0;
1539}
1540
Victor Stinnerd3f08822012-05-29 12:57:52 +02001541void
1542_PyUnicode_FastCopyCharacters(
1543 PyObject *to, Py_ssize_t to_start,
1544 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001545{
1546 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1547}
1548
1549Py_ssize_t
1550PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1551 PyObject *from, Py_ssize_t from_start,
1552 Py_ssize_t how_many)
1553{
1554 int err;
1555
1556 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1557 PyErr_BadInternalCall();
1558 return -1;
1559 }
1560
Benjamin Petersonbac79492012-01-14 13:34:47 -05001561 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001563 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return -1;
1565
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001566 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001567 PyErr_SetString(PyExc_IndexError, "string index out of range");
1568 return -1;
1569 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001570 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001571 PyErr_SetString(PyExc_IndexError, "string index out of range");
1572 return -1;
1573 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001574 if (how_many < 0) {
1575 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1576 return -1;
1577 }
1578 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001579 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1580 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001581 "Cannot write %zi characters at %zi "
1582 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001583 how_many, to_start, PyUnicode_GET_LENGTH(to));
1584 return -1;
1585 }
1586
1587 if (how_many == 0)
1588 return 0;
1589
Victor Stinner488fa492011-12-12 00:01:39 +01001590 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001591 return -1;
1592
1593 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1594 if (err) {
1595 PyErr_Format(PyExc_SystemError,
1596 "Cannot copy %s characters "
1597 "into a string of %s characters",
1598 unicode_kind_name(from),
1599 unicode_kind_name(to));
1600 return -1;
1601 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001602 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603}
1604
Victor Stinner17222162011-09-28 22:15:37 +02001605/* Find the maximum code point and count the number of surrogate pairs so a
1606 correct string length can be computed before converting a string to UCS4.
1607 This function counts single surrogates as a character and not as a pair.
1608
1609 Return 0 on success, or -1 on error. */
1610static int
1611find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1612 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613{
1614 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001615 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616
Victor Stinnerc53be962011-10-02 21:33:54 +02001617 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618 *num_surrogates = 0;
1619 *maxchar = 0;
1620
1621 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001623 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1624 && (iter+1) < end
1625 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1626 {
1627 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1628 ++(*num_surrogates);
1629 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 }
1631 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001633 {
1634 ch = *iter;
1635 iter++;
1636 }
1637 if (ch > *maxchar) {
1638 *maxchar = ch;
1639 if (*maxchar > MAX_UNICODE) {
1640 PyErr_Format(PyExc_ValueError,
1641 "character U+%x is not in range [U+0000; U+10ffff]",
1642 ch);
1643 return -1;
1644 }
1645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 }
1647 return 0;
1648}
1649
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001650int
1651_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652{
1653 wchar_t *end;
1654 Py_UCS4 maxchar = 0;
1655 Py_ssize_t num_surrogates;
1656#if SIZEOF_WCHAR_T == 2
1657 Py_ssize_t length_wo_surrogates;
1658#endif
1659
Georg Brandl7597add2011-10-05 16:36:47 +02001660 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001661 strings were created using _PyObject_New() and where no canonical
1662 representation (the str field) has been set yet aka strings
1663 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001664 assert(_PyUnicode_CHECK(unicode));
1665 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001667 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001668 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001669 /* Actually, it should neither be interned nor be anything else: */
1670 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001673 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001674 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676
1677 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001678 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1679 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 PyErr_NoMemory();
1681 return -1;
1682 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001683 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 _PyUnicode_WSTR(unicode), end,
1685 PyUnicode_1BYTE_DATA(unicode));
1686 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1687 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1688 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1689 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001690 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001691 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001692 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001695 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001696 _PyUnicode_UTF8(unicode) = NULL;
1697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 }
1699 PyObject_FREE(_PyUnicode_WSTR(unicode));
1700 _PyUnicode_WSTR(unicode) = NULL;
1701 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1702 }
1703 /* In this case we might have to convert down from 4-byte native
1704 wchar_t to 2-byte unicode. */
1705 else if (maxchar < 65536) {
1706 assert(num_surrogates == 0 &&
1707 "FindMaxCharAndNumSurrogatePairs() messed up");
1708
Victor Stinner506f5922011-09-28 22:34:18 +02001709#if SIZEOF_WCHAR_T == 2
1710 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001711 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001712 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001715 _PyUnicode_UTF8(unicode) = NULL;
1716 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001717#else
1718 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001719 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001720 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001721 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001722 PyErr_NoMemory();
1723 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 }
Victor Stinner506f5922011-09-28 22:34:18 +02001725 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1726 _PyUnicode_WSTR(unicode), end,
1727 PyUnicode_2BYTE_DATA(unicode));
1728 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1729 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1730 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001731 _PyUnicode_UTF8(unicode) = NULL;
1732 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001733 PyObject_FREE(_PyUnicode_WSTR(unicode));
1734 _PyUnicode_WSTR(unicode) = NULL;
1735 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1736#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 }
1738 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1739 else {
1740#if SIZEOF_WCHAR_T == 2
1741 /* in case the native representation is 2-bytes, we need to allocate a
1742 new normalized 4-byte version. */
1743 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001744 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1745 PyErr_NoMemory();
1746 return -1;
1747 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001748 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1749 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 PyErr_NoMemory();
1751 return -1;
1752 }
1753 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1754 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001755 _PyUnicode_UTF8(unicode) = NULL;
1756 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001757 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1758 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001759 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 PyObject_FREE(_PyUnicode_WSTR(unicode));
1761 _PyUnicode_WSTR(unicode) = NULL;
1762 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1763#else
1764 assert(num_surrogates == 0);
1765
Victor Stinnerc3c74152011-10-02 20:39:55 +02001766 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001768 _PyUnicode_UTF8(unicode) = NULL;
1769 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1771#endif
1772 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1773 }
1774 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001775 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 return 0;
1777}
1778
Alexander Belopolsky40018472011-02-26 01:02:56 +00001779static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001780unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781{
Walter Dörwald16807132007-05-25 13:52:07 +00001782 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 case SSTATE_NOT_INTERNED:
1784 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001785
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 case SSTATE_INTERNED_MORTAL:
1787 /* revive dead object temporarily for DelItem */
1788 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001789 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001790 Py_FatalError(
1791 "deletion of interned string failed");
1792 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001793
Benjamin Peterson29060642009-01-31 22:14:21 +00001794 case SSTATE_INTERNED_IMMORTAL:
1795 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001796 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if 'unicode' is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length 1 can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002080 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002173 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002174 }
2175}
2176
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002177static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002178align_maxchar(Py_UCS4 maxchar)
2179{
2180 if (maxchar <= 127)
2181 return 127;
2182 else if (maxchar <= 255)
2183 return 255;
2184 else if (maxchar <= 65535)
2185 return 65535;
2186 else
2187 return MAX_UNICODE;
2188}
2189
Victor Stinner702c7342011-10-05 13:50:52 +02002190static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002191_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002194 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002195
Serhiy Storchaka678db842013-01-26 12:16:36 +02002196 if (size == 0)
2197 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002198 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002199 if (size == 1)
2200 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002201
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002202 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002203 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 if (!res)
2205 return NULL;
2206 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002207 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002209}
2210
Victor Stinnere57b1c02011-09-28 22:20:48 +02002211static PyObject*
2212_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213{
2214 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002215 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002216
Serhiy Storchaka678db842013-01-26 12:16:36 +02002217 if (size == 0)
2218 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002219 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002220 if (size == 1)
2221 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002222
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002223 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002224 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 if (!res)
2226 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002227 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002229 else {
2230 _PyUnicode_CONVERT_BYTES(
2231 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2232 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002233 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return res;
2235}
2236
Victor Stinnere57b1c02011-09-28 22:20:48 +02002237static PyObject*
2238_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239{
2240 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002241 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002242
Serhiy Storchaka678db842013-01-26 12:16:36 +02002243 if (size == 0)
2244 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002245 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002246 if (size == 1)
2247 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002249 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002250 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 if (!res)
2252 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002253 if (max_char < 256)
2254 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2255 PyUnicode_1BYTE_DATA(res));
2256 else if (max_char < 0x10000)
2257 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2258 PyUnicode_2BYTE_DATA(res));
2259 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002261 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 return res;
2263}
2264
2265PyObject*
2266PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2267{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002268 if (size < 0) {
2269 PyErr_SetString(PyExc_ValueError, "size must be positive");
2270 return NULL;
2271 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002272 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002274 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002276 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002278 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002279 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 PyErr_SetString(PyExc_SystemError, "invalid kind");
2281 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283}
2284
Victor Stinnerece58de2012-04-23 23:36:38 +02002285Py_UCS4
2286_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2287{
2288 enum PyUnicode_Kind kind;
2289 void *startptr, *endptr;
2290
2291 assert(PyUnicode_IS_READY(unicode));
2292 assert(0 <= start);
2293 assert(end <= PyUnicode_GET_LENGTH(unicode));
2294 assert(start <= end);
2295
2296 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2297 return PyUnicode_MAX_CHAR_VALUE(unicode);
2298
2299 if (start == end)
2300 return 127;
2301
Victor Stinner94d558b2012-04-27 22:26:58 +02002302 if (PyUnicode_IS_ASCII(unicode))
2303 return 127;
2304
Victor Stinnerece58de2012-04-23 23:36:38 +02002305 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002306 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002307 endptr = (char *)startptr + end * kind;
2308 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002309 switch(kind) {
2310 case PyUnicode_1BYTE_KIND:
2311 return ucs1lib_find_max_char(startptr, endptr);
2312 case PyUnicode_2BYTE_KIND:
2313 return ucs2lib_find_max_char(startptr, endptr);
2314 case PyUnicode_4BYTE_KIND:
2315 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002316 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002317 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002318 }
2319}
2320
Victor Stinner25a4b292011-10-06 12:31:55 +02002321/* Ensure that a string uses the most efficient storage, if it is not the
2322 case: create a new string with of the right kind. Write NULL into *p_unicode
2323 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002324static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002325unicode_adjust_maxchar(PyObject **p_unicode)
2326{
2327 PyObject *unicode, *copy;
2328 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002329 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002330 unsigned int kind;
2331
2332 assert(p_unicode != NULL);
2333 unicode = *p_unicode;
2334 assert(PyUnicode_IS_READY(unicode));
2335 if (PyUnicode_IS_ASCII(unicode))
2336 return;
2337
2338 len = PyUnicode_GET_LENGTH(unicode);
2339 kind = PyUnicode_KIND(unicode);
2340 if (kind == PyUnicode_1BYTE_KIND) {
2341 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002342 max_char = ucs1lib_find_max_char(u, u + len);
2343 if (max_char >= 128)
2344 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002345 }
2346 else if (kind == PyUnicode_2BYTE_KIND) {
2347 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002348 max_char = ucs2lib_find_max_char(u, u + len);
2349 if (max_char >= 256)
2350 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002351 }
2352 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002353 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002354 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 max_char = ucs4lib_find_max_char(u, u + len);
2356 if (max_char >= 0x10000)
2357 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002359 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002360 if (copy != NULL)
2361 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002362 Py_DECREF(unicode);
2363 *p_unicode = copy;
2364}
2365
Victor Stinner034f6cf2011-09-30 02:26:44 +02002366PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002367_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368{
Victor Stinner87af4f22011-11-21 23:03:47 +01002369 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002370 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002371
Victor Stinner034f6cf2011-09-30 02:26:44 +02002372 if (!PyUnicode_Check(unicode)) {
2373 PyErr_BadInternalCall();
2374 return NULL;
2375 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002376 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002377 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002378
Victor Stinner87af4f22011-11-21 23:03:47 +01002379 length = PyUnicode_GET_LENGTH(unicode);
2380 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002381 if (!copy)
2382 return NULL;
2383 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2384
Christian Heimesf051e432016-09-13 20:22:02 +02002385 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002386 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002387 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002388 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389}
2390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391
Victor Stinnerbc603d12011-10-02 01:00:40 +02002392/* Widen Unicode objects to larger buffers. Don't write terminating null
2393 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394
2395void*
2396_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2397{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002398 Py_ssize_t len;
2399 void *result;
2400 unsigned int skind;
2401
Benjamin Petersonbac79492012-01-14 13:34:47 -05002402 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002403 return NULL;
2404
2405 len = PyUnicode_GET_LENGTH(s);
2406 skind = PyUnicode_KIND(s);
2407 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002408 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 return NULL;
2410 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002411 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002412 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002413 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 if (!result)
2415 return PyErr_NoMemory();
2416 assert(skind == PyUnicode_1BYTE_KIND);
2417 _PyUnicode_CONVERT_BYTES(
2418 Py_UCS1, Py_UCS2,
2419 PyUnicode_1BYTE_DATA(s),
2420 PyUnicode_1BYTE_DATA(s) + len,
2421 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002423 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002424 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 if (!result)
2426 return PyErr_NoMemory();
2427 if (skind == PyUnicode_2BYTE_KIND) {
2428 _PyUnicode_CONVERT_BYTES(
2429 Py_UCS2, Py_UCS4,
2430 PyUnicode_2BYTE_DATA(s),
2431 PyUnicode_2BYTE_DATA(s) + len,
2432 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002434 else {
2435 assert(skind == PyUnicode_1BYTE_KIND);
2436 _PyUnicode_CONVERT_BYTES(
2437 Py_UCS1, Py_UCS4,
2438 PyUnicode_1BYTE_DATA(s),
2439 PyUnicode_1BYTE_DATA(s) + len,
2440 result);
2441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002443 default:
2444 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinner01698042011-10-04 00:04:26 +02002446 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 return NULL;
2448}
2449
2450static Py_UCS4*
2451as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2452 int copy_null)
2453{
2454 int kind;
2455 void *data;
2456 Py_ssize_t len, targetlen;
2457 if (PyUnicode_READY(string) == -1)
2458 return NULL;
2459 kind = PyUnicode_KIND(string);
2460 data = PyUnicode_DATA(string);
2461 len = PyUnicode_GET_LENGTH(string);
2462 targetlen = len;
2463 if (copy_null)
2464 targetlen++;
2465 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002466 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 if (!target) {
2468 PyErr_NoMemory();
2469 return NULL;
2470 }
2471 }
2472 else {
2473 if (targetsize < targetlen) {
2474 PyErr_Format(PyExc_SystemError,
2475 "string is longer than the buffer");
2476 if (copy_null && 0 < targetsize)
2477 target[0] = 0;
2478 return NULL;
2479 }
2480 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002481 if (kind == PyUnicode_1BYTE_KIND) {
2482 Py_UCS1 *start = (Py_UCS1 *) data;
2483 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002485 else if (kind == PyUnicode_2BYTE_KIND) {
2486 Py_UCS2 *start = (Py_UCS2 *) data;
2487 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2488 }
2489 else {
2490 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002491 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 if (copy_null)
2494 target[len] = 0;
2495 return target;
2496}
2497
2498Py_UCS4*
2499PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2500 int copy_null)
2501{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002502 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 PyErr_BadInternalCall();
2504 return NULL;
2505 }
2506 return as_ucs4(string, target, targetsize, copy_null);
2507}
2508
2509Py_UCS4*
2510PyUnicode_AsUCS4Copy(PyObject *string)
2511{
2512 return as_ucs4(string, NULL, 0, 1);
2513}
2514
/* Size of the scratch buffers used for the output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, and 53/22 is an upper bound for log10(256).  The extra 2
   presumably leaves room for the sign and the terminating NUL written by
   sprintf(). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002519
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002520static int
2521unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2522 Py_ssize_t width, Py_ssize_t precision)
2523{
2524 Py_ssize_t length, fill, arglen;
2525 Py_UCS4 maxchar;
2526
2527 if (PyUnicode_READY(str) == -1)
2528 return -1;
2529
2530 length = PyUnicode_GET_LENGTH(str);
2531 if ((precision == -1 || precision >= length)
2532 && width <= length)
2533 return _PyUnicodeWriter_WriteStr(writer, str);
2534
2535 if (precision != -1)
2536 length = Py_MIN(precision, length);
2537
2538 arglen = Py_MAX(length, width);
2539 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2540 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2541 else
2542 maxchar = writer->maxchar;
2543
2544 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2545 return -1;
2546
2547 if (width > length) {
2548 fill = width - length;
2549 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2550 return -1;
2551 writer->pos += fill;
2552 }
2553
2554 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2555 str, 0, length);
2556 writer->pos += length;
2557 return 0;
2558}
2559
2560static int
2561unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2562 Py_ssize_t width, Py_ssize_t precision)
2563{
2564 /* UTF-8 */
2565 Py_ssize_t length;
2566 PyObject *unicode;
2567 int res;
2568
2569 length = strlen(str);
2570 if (precision != -1)
2571 length = Py_MIN(length, precision);
2572 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2573 if (unicode == NULL)
2574 return -1;
2575
2576 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2577 Py_DECREF(unicode);
2578 return res;
2579}
2580
Victor Stinner96865452011-03-01 23:44:09 +00002581static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002582unicode_fromformat_arg(_PyUnicodeWriter *writer,
2583 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002584{
Victor Stinnere215d962012-10-06 23:03:36 +02002585 const char *p;
2586 Py_ssize_t len;
2587 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 Py_ssize_t width;
2589 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002590 int longflag;
2591 int longlongflag;
2592 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002594
2595 p = f;
2596 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002597 zeropad = 0;
2598 if (*f == '0') {
2599 zeropad = 1;
2600 f++;
2601 }
Victor Stinner96865452011-03-01 23:44:09 +00002602
2603 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 width = -1;
2605 if (Py_ISDIGIT((unsigned)*f)) {
2606 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002607 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002608 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002609 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002610 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002612 return NULL;
2613 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002615 f++;
2616 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 }
2618 precision = -1;
2619 if (*f == '.') {
2620 f++;
2621 if (Py_ISDIGIT((unsigned)*f)) {
2622 precision = (*f - '0');
2623 f++;
2624 while (Py_ISDIGIT((unsigned)*f)) {
2625 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2626 PyErr_SetString(PyExc_ValueError,
2627 "precision too big");
2628 return NULL;
2629 }
2630 precision = (precision * 10) + (*f - '0');
2631 f++;
2632 }
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634 if (*f == '%') {
2635 /* "%.3%s" => f points to "3" */
2636 f--;
2637 }
2638 }
2639 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002640 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002641 f--;
2642 }
Victor Stinner96865452011-03-01 23:44:09 +00002643
2644 /* Handle %ld, %lu, %lld and %llu. */
2645 longflag = 0;
2646 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002647 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002648 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002649 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002650 longflag = 1;
2651 ++f;
2652 }
Victor Stinner96865452011-03-01 23:44:09 +00002653 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002654 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002655 longlongflag = 1;
2656 f += 2;
2657 }
Victor Stinner96865452011-03-01 23:44:09 +00002658 }
2659 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002660 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002661 size_tflag = 1;
2662 ++f;
2663 }
Victor Stinnere215d962012-10-06 23:03:36 +02002664
2665 if (f[1] == '\0')
2666 writer->overallocate = 0;
2667
2668 switch (*f) {
2669 case 'c':
2670 {
2671 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002672 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002673 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 "character argument not in range(0x110000)");
2675 return NULL;
2676 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002677 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002678 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002679 break;
2680 }
2681
2682 case 'i':
2683 case 'd':
2684 case 'u':
2685 case 'x':
2686 {
2687 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002688 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002689 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002690
2691 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002695 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002696 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002697 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002699 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002700 va_arg(*vargs, size_t));
2701 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002702 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002703 va_arg(*vargs, unsigned int));
2704 }
2705 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002707 }
2708 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002709 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002712 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002713 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002714 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002715 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002716 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002717 va_arg(*vargs, Py_ssize_t));
2718 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002719 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002720 va_arg(*vargs, int));
2721 }
2722 assert(len >= 0);
2723
Victor Stinnere215d962012-10-06 23:03:36 +02002724 if (precision < len)
2725 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726
2727 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2729 return NULL;
2730
Victor Stinnere215d962012-10-06 23:03:36 +02002731 if (width > precision) {
2732 Py_UCS4 fillchar;
2733 fill = width - precision;
2734 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002735 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2736 return NULL;
2737 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002738 }
Victor Stinner15a11362012-10-06 23:48:20 +02002739 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002740 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2742 return NULL;
2743 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002744 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745
Victor Stinner4a587072013-11-19 12:54:53 +01002746 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2747 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002748 break;
2749 }
2750
2751 case 'p':
2752 {
2753 char number[MAX_LONG_LONG_CHARS];
2754
2755 len = sprintf(number, "%p", va_arg(*vargs, void*));
2756 assert(len >= 0);
2757
2758 /* %p is ill-defined: ensure leading 0x. */
2759 if (number[1] == 'X')
2760 number[1] = 'x';
2761 else if (number[1] != 'x') {
2762 memmove(number + 2, number,
2763 strlen(number) + 1);
2764 number[0] = '0';
2765 number[1] = 'x';
2766 len += 2;
2767 }
2768
Victor Stinner4a587072013-11-19 12:54:53 +01002769 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
2771 break;
2772 }
2773
2774 case 's':
2775 {
2776 /* UTF-8 */
2777 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002778 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002779 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002780 break;
2781 }
2782
2783 case 'U':
2784 {
2785 PyObject *obj = va_arg(*vargs, PyObject *);
2786 assert(obj && _PyUnicode_CHECK(obj));
2787
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002788 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002789 return NULL;
2790 break;
2791 }
2792
2793 case 'V':
2794 {
2795 PyObject *obj = va_arg(*vargs, PyObject *);
2796 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002797 if (obj) {
2798 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002799 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002800 return NULL;
2801 }
2802 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 assert(str != NULL);
2804 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002805 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002806 }
2807 break;
2808 }
2809
2810 case 'S':
2811 {
2812 PyObject *obj = va_arg(*vargs, PyObject *);
2813 PyObject *str;
2814 assert(obj);
2815 str = PyObject_Str(obj);
2816 if (!str)
2817 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002818 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002819 Py_DECREF(str);
2820 return NULL;
2821 }
2822 Py_DECREF(str);
2823 break;
2824 }
2825
2826 case 'R':
2827 {
2828 PyObject *obj = va_arg(*vargs, PyObject *);
2829 PyObject *repr;
2830 assert(obj);
2831 repr = PyObject_Repr(obj);
2832 if (!repr)
2833 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002834 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002835 Py_DECREF(repr);
2836 return NULL;
2837 }
2838 Py_DECREF(repr);
2839 break;
2840 }
2841
2842 case 'A':
2843 {
2844 PyObject *obj = va_arg(*vargs, PyObject *);
2845 PyObject *ascii;
2846 assert(obj);
2847 ascii = PyObject_ASCII(obj);
2848 if (!ascii)
2849 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002850 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002851 Py_DECREF(ascii);
2852 return NULL;
2853 }
2854 Py_DECREF(ascii);
2855 break;
2856 }
2857
2858 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002859 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002860 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002861 break;
2862
2863 default:
2864 /* if we stumble upon an unknown formatting code, copy the rest
2865 of the format string to the output string. (we cannot just
2866 skip the code, since there's no way to know what's in the
2867 argument list) */
2868 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002869 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002870 return NULL;
2871 f = p+len;
2872 return f;
2873 }
2874
2875 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002876 return f;
2877}
2878
Walter Dörwaldd2034312007-05-18 16:29:38 +00002879PyObject *
2880PyUnicode_FromFormatV(const char *format, va_list vargs)
2881{
Victor Stinnere215d962012-10-06 23:03:36 +02002882 va_list vargs2;
2883 const char *f;
2884 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002885
Victor Stinner8f674cc2013-04-17 23:02:17 +02002886 _PyUnicodeWriter_Init(&writer);
2887 writer.min_length = strlen(format) + 100;
2888 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002889
Benjamin Peterson0c212142016-09-20 20:39:33 -07002890 // Copy varags to be able to pass a reference to a subfunction.
2891 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002892
2893 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002894 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002895 f = unicode_fromformat_arg(&writer, f, &vargs2);
2896 if (f == NULL)
2897 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002900 const char *p;
2901 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002902
Victor Stinnere215d962012-10-06 23:03:36 +02002903 p = f;
2904 do
2905 {
2906 if ((unsigned char)*p > 127) {
2907 PyErr_Format(PyExc_ValueError,
2908 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2909 "string, got a non-ASCII byte: 0x%02x",
2910 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002911 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002912 }
2913 p++;
2914 }
2915 while (*p != '\0' && *p != '%');
2916 len = p - f;
2917
2918 if (*p == '\0')
2919 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002920
2921 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002922 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002923
2924 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002926 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002927 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002928 return _PyUnicodeWriter_Finish(&writer);
2929
2930 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002931 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002932 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002933 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002934}
2935
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936PyObject *
2937PyUnicode_FromFormat(const char *format, ...)
2938{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 PyObject* ret;
2940 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941
2942#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002944#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 ret = PyUnicode_FromFormatV(format, vargs);
2948 va_end(vargs);
2949 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002950}
2951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952#ifdef HAVE_WCHAR_H
2953
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002954/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002955
Victor Stinnerd88d9832011-09-06 02:00:05 +02002956 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002957 character) required to convert the unicode object. Ignore size argument.
2958
Victor Stinnerd88d9832011-09-06 02:00:05 +02002959 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002961 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002962Py_ssize_t
2963PyUnicode_AsWideChar(PyObject *unicode,
2964 wchar_t *w,
2965 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002966{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002968 const wchar_t *wstr;
2969
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002970 if (unicode == NULL) {
2971 PyErr_BadInternalCall();
2972 return -1;
2973 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002974 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 if (wstr == NULL)
2976 return -1;
2977
Victor Stinner5593d8a2010-10-02 11:11:27 +00002978 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002979 if (size > res)
2980 size = res + 1;
2981 else
2982 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002983 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002984 return res;
2985 }
2986 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002987 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002994 const wchar_t *wstr;
2995 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002996 Py_ssize_t buflen;
2997
2998 if (unicode == NULL) {
2999 PyErr_BadInternalCall();
3000 return NULL;
3001 }
3002
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003003 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3004 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003006 }
3007 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3008 PyErr_SetString(PyExc_ValueError,
3009 "embedded null character");
3010 return NULL;
3011 }
3012
3013 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003014 if (buffer == NULL) {
3015 PyErr_NoMemory();
3016 return NULL;
3017 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003018 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003019 if (size != NULL)
3020 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003021 return buffer;
3022}
3023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025
Alexander Belopolsky40018472011-02-26 01:02:56 +00003026PyObject *
3027PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003028{
Victor Stinner8faf8212011-12-08 22:14:11 +01003029 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 PyErr_SetString(PyExc_ValueError,
3031 "chr() arg not in range(0x110000)");
3032 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003033 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003034
Victor Stinner985a82a2014-01-03 12:53:47 +01003035 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003036}
3037
Alexander Belopolsky40018472011-02-26 01:02:56 +00003038PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003039PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003041 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003044 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003045 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 Py_INCREF(obj);
3047 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003048 }
3049 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 /* For a Unicode subtype that's not a Unicode object,
3051 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003052 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003053 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003054 PyErr_Format(PyExc_TypeError,
3055 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003056 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003057 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003058}
3059
Alexander Belopolsky40018472011-02-26 01:02:56 +00003060PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003061PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003062 const char *encoding,
3063 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003064{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003065 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003066 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003067
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 PyErr_BadInternalCall();
3070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003072
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003073 /* Decoding bytes objects is the most common case and should be fast */
3074 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003075 if (PyBytes_GET_SIZE(obj) == 0)
3076 _Py_RETURN_UNICODE_EMPTY();
3077 v = PyUnicode_Decode(
3078 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3079 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 return v;
3081 }
3082
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003083 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 PyErr_SetString(PyExc_TypeError,
3085 "decoding str is not supported");
3086 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003088
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3090 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3091 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003092 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 Py_TYPE(obj)->tp_name);
3094 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003095 }
Tim Petersced69f82003-09-16 20:30:58 +00003096
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003097 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 PyBuffer_Release(&buffer);
3099 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003101
Serhiy Storchaka05997252013-01-26 12:14:02 +02003102 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003103 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003104 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105}
3106
/* Normalize an encoding name into the buffer `lower` (capacity lower_len
   bytes, terminating NUL included).

   Similar to encodings.normalize_encoding(), with lowercasing on top:
   alphanumeric characters and '.' are kept (lowercased); every run of other
   characters located between two kept characters collapses into a single
   '_'; leading and trailing runs are dropped.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *dst_end = &lower[lower_len - 1];   /* slot reserved for the NUL */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember that a separator may be needed. */
            pending_sep = 1;
            continue;
        }

        /* Emit one '_' for the preceding punctuation run, except at the
           very start of the output. */
        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3155
Alexander Belopolsky40018472011-02-26 01:02:56 +00003156PyObject *
3157PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003158 Py_ssize_t size,
3159 const char *encoding,
3160 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003161{
3162 PyObject *buffer = NULL, *unicode;
3163 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003164 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3165
3166 if (encoding == NULL) {
3167 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3168 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003169
Fred Drakee4315f52000-05-09 19:53:39 +00003170 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003171 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3172 char *lower = buflower;
3173
3174 /* Fast paths */
3175 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3176 lower += 3;
3177 if (*lower == '_') {
3178 /* Match "utf8" and "utf_8" */
3179 lower++;
3180 }
3181
3182 if (lower[0] == '8' && lower[1] == 0) {
3183 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3184 }
3185 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3186 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3187 }
3188 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3189 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3190 }
3191 }
3192 else {
3193 if (strcmp(lower, "ascii") == 0
3194 || strcmp(lower, "us_ascii") == 0) {
3195 return PyUnicode_DecodeASCII(s, size, errors);
3196 }
Steve Dowercc16be82016-09-08 10:35:16 -07003197 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003198 else if (strcmp(lower, "mbcs") == 0) {
3199 return PyUnicode_DecodeMBCS(s, size, errors);
3200 }
3201 #endif
3202 else if (strcmp(lower, "latin1") == 0
3203 || strcmp(lower, "latin_1") == 0
3204 || strcmp(lower, "iso_8859_1") == 0
3205 || strcmp(lower, "iso8859_1") == 0) {
3206 return PyUnicode_DecodeLatin1(s, size, errors);
3207 }
3208 }
Victor Stinner37296e82010-06-10 13:36:23 +00003209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210
3211 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003212 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003213 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003214 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003215 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 if (buffer == NULL)
3217 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003218 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 if (unicode == NULL)
3220 goto onError;
3221 if (!PyUnicode_Check(unicode)) {
3222 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003223 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3224 "use codecs.decode() to decode to arbitrary types",
3225 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003226 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 Py_DECREF(unicode);
3228 goto onError;
3229 }
3230 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003231 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 Py_XDECREF(buffer);
3235 return NULL;
3236}
3237
Alexander Belopolsky40018472011-02-26 01:02:56 +00003238PyObject *
3239PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003240 const char *encoding,
3241 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003243 if (!PyUnicode_Check(unicode)) {
3244 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003245 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246 }
3247
Serhiy Storchaka00939072016-10-27 21:05:49 +03003248 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3249 "PyUnicode_AsDecodedObject() is deprecated; "
3250 "use PyCodec_Decode() to decode from str", 1) < 0)
3251 return NULL;
3252
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003253 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255
3256 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003257 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003258}
3259
Alexander Belopolsky40018472011-02-26 01:02:56 +00003260PyObject *
3261PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003262 const char *encoding,
3263 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003264{
3265 PyObject *v;
3266
3267 if (!PyUnicode_Check(unicode)) {
3268 PyErr_BadArgument();
3269 goto onError;
3270 }
3271
Serhiy Storchaka00939072016-10-27 21:05:49 +03003272 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3273 "PyUnicode_AsDecodedUnicode() is deprecated; "
3274 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3275 return NULL;
3276
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003277 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003279
3280 /* Decode via the codec registry */
3281 v = PyCodec_Decode(unicode, encoding, errors);
3282 if (v == NULL)
3283 goto onError;
3284 if (!PyUnicode_Check(v)) {
3285 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003286 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3287 "use codecs.decode() to decode to arbitrary types",
3288 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003289 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290 Py_DECREF(v);
3291 goto onError;
3292 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003293 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003294
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003296 return NULL;
3297}
3298
Alexander Belopolsky40018472011-02-26 01:02:56 +00003299PyObject *
3300PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003301 Py_ssize_t size,
3302 const char *encoding,
3303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304{
3305 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003306
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003307 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3311 Py_DECREF(unicode);
3312 return v;
3313}
3314
Alexander Belopolsky40018472011-02-26 01:02:56 +00003315PyObject *
3316PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003317 const char *encoding,
3318 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003319{
3320 PyObject *v;
3321
3322 if (!PyUnicode_Check(unicode)) {
3323 PyErr_BadArgument();
3324 goto onError;
3325 }
3326
Serhiy Storchaka00939072016-10-27 21:05:49 +03003327 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3328 "PyUnicode_AsEncodedObject() is deprecated; "
3329 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3330 "or PyCodec_Encode() for generic encoding", 1) < 0)
3331 return NULL;
3332
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003333 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003334 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003335
3336 /* Encode via the codec registry */
3337 v = PyCodec_Encode(unicode, encoding, errors);
3338 if (v == NULL)
3339 goto onError;
3340 return v;
3341
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003343 return NULL;
3344}
3345
/* Find the first character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16-bit).  Return the index, in wchar_t units, of the
   first character for which wcstombs() fails, or 0 if every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;
        size_t converted;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif

        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3392
Victor Stinner1b579672011-12-17 05:47:23 +01003393static int
3394locale_error_handler(const char *errors, int *surrogateescape)
3395{
Victor Stinner50149202015-09-22 00:26:54 +02003396 _Py_error_handler error_handler = get_error_handler(errors);
3397 switch (error_handler)
3398 {
3399 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003400 *surrogateescape = 0;
3401 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003402 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003403 *surrogateescape = 1;
3404 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003405 default:
3406 PyErr_Format(PyExc_ValueError,
3407 "only 'strict' and 'surrogateescape' error handlers "
3408 "are supported, not '%s'",
3409 errors);
3410 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003411 }
Victor Stinner1b579672011-12-17 05:47:23 +01003412}
3413
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003414PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003415PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416{
3417 Py_ssize_t wlen, wlen2;
3418 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003420 PyObject *bytes, *reason, *exc;
3421 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003422 int surrogateescape;
3423
3424 if (locale_error_handler(errors, &surrogateescape) < 0)
3425 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426
3427 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3428 if (wstr == NULL)
3429 return NULL;
3430
3431 wlen2 = wcslen(wstr);
3432 if (wlen2 != wlen) {
3433 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003434 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003435 return NULL;
3436 }
3437
3438 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003439 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003440 char *str;
3441
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003442 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003443 if (str == NULL) {
3444 if (error_pos == (size_t)-1) {
3445 PyErr_NoMemory();
3446 PyMem_Free(wstr);
3447 return NULL;
3448 }
3449 else {
3450 goto encode_error;
3451 }
3452 }
3453 PyMem_Free(wstr);
3454
3455 bytes = PyBytes_FromString(str);
3456 PyMem_Free(str);
3457 }
3458 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003459 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003460 size_t len, len2;
3461
3462 len = wcstombs(NULL, wstr, 0);
3463 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003464 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003465 goto encode_error;
3466 }
3467
3468 bytes = PyBytes_FromStringAndSize(NULL, len);
3469 if (bytes == NULL) {
3470 PyMem_Free(wstr);
3471 return NULL;
3472 }
3473
3474 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3475 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003476 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003477 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003478 goto encode_error;
3479 }
3480 PyMem_Free(wstr);
3481 }
3482 return bytes;
3483
3484encode_error:
3485 errmsg = strerror(errno);
3486 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003487
3488 if (error_pos == (size_t)-1)
3489 error_pos = wcstombs_errorpos(wstr);
3490
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003493 wstr = Py_DecodeLocale(errmsg, &errlen);
3494 if (wstr != NULL) {
3495 reason = PyUnicode_FromWideChar(wstr, errlen);
3496 PyMem_RawFree(wstr);
3497 } else {
3498 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003499 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003500
Victor Stinner2f197072011-12-17 07:08:30 +01003501 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003502 reason = PyUnicode_FromString(
3503 "wcstombs() encountered an unencodable "
3504 "wide character");
3505 if (reason == NULL)
3506 return NULL;
3507
3508 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3509 "locale", unicode,
3510 (Py_ssize_t)error_pos,
3511 (Py_ssize_t)(error_pos+1),
3512 reason);
3513 Py_DECREF(reason);
3514 if (exc != NULL) {
3515 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003516 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003517 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003518 return NULL;
3519}
3520
Victor Stinnerad158722010-10-27 00:25:46 +00003521PyObject *
3522PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003523{
Steve Dowercc16be82016-09-08 10:35:16 -07003524#if defined(__APPLE__)
3525 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003526#else
Victor Stinner793b5312011-04-27 00:24:21 +02003527 PyInterpreterState *interp = PyThreadState_GET()->interp;
3528 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3529 cannot use it to encode and decode filenames before it is loaded. Load
3530 the Python codec requires to encode at least its own filename. Use the C
3531 version of the locale codec until the codec registry is initialized and
3532 the Python codec is loaded.
3533
3534 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3535 cannot only rely on it: check also interp->fscodec_initialized for
3536 subinterpreters. */
3537 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003538 return PyUnicode_AsEncodedString(unicode,
3539 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003540 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003541 }
3542 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003543 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003544 }
Victor Stinnerad158722010-10-27 00:25:46 +00003545#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003546}
3547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548PyObject *
3549PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003550 const char *encoding,
3551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552{
3553 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003554 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003555
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 if (!PyUnicode_Check(unicode)) {
3557 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 }
Fred Drakee4315f52000-05-09 19:53:39 +00003560
Victor Stinner942889a2016-09-05 15:40:10 -07003561 if (encoding == NULL) {
3562 return _PyUnicode_AsUTF8String(unicode, errors);
3563 }
3564
Fred Drakee4315f52000-05-09 19:53:39 +00003565 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003566 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3567 char *lower = buflower;
3568
3569 /* Fast paths */
3570 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3571 lower += 3;
3572 if (*lower == '_') {
3573 /* Match "utf8" and "utf_8" */
3574 lower++;
3575 }
3576
3577 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003578 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003579 }
3580 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3581 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3582 }
3583 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3584 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3585 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003586 }
Victor Stinner942889a2016-09-05 15:40:10 -07003587 else {
3588 if (strcmp(lower, "ascii") == 0
3589 || strcmp(lower, "us_ascii") == 0) {
3590 return _PyUnicode_AsASCIIString(unicode, errors);
3591 }
Steve Dowercc16be82016-09-08 10:35:16 -07003592#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003593 else if (strcmp(lower, "mbcs") == 0) {
3594 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3595 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003596#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003597 else if (strcmp(lower, "latin1") == 0 ||
3598 strcmp(lower, "latin_1") == 0 ||
3599 strcmp(lower, "iso_8859_1") == 0 ||
3600 strcmp(lower, "iso8859_1") == 0) {
3601 return _PyUnicode_AsLatin1String(unicode, errors);
3602 }
3603 }
Victor Stinner37296e82010-06-10 13:36:23 +00003604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605
3606 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003607 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003609 return NULL;
3610
3611 /* The normal path */
3612 if (PyBytes_Check(v))
3613 return v;
3614
3615 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003616 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003617 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003618 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003619
3620 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003621 "encoder %s returned bytearray instead of bytes; "
3622 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003623 encoding);
3624 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003625 Py_DECREF(v);
3626 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003628
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003629 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3630 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003631 Py_DECREF(v);
3632 return b;
3633 }
3634
3635 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003636 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3637 "use codecs.encode() to encode to arbitrary types",
3638 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003639 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003640 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003641 return NULL;
3642}
3643
Alexander Belopolsky40018472011-02-26 01:02:56 +00003644PyObject *
3645PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003646 const char *encoding,
3647 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003648{
3649 PyObject *v;
3650
3651 if (!PyUnicode_Check(unicode)) {
3652 PyErr_BadArgument();
3653 goto onError;
3654 }
3655
Serhiy Storchaka00939072016-10-27 21:05:49 +03003656 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3657 "PyUnicode_AsEncodedUnicode() is deprecated; "
3658 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3659 return NULL;
3660
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003663
3664 /* Encode via the codec registry */
3665 v = PyCodec_Encode(unicode, encoding, errors);
3666 if (v == NULL)
3667 goto onError;
3668 if (!PyUnicode_Check(v)) {
3669 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003670 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3671 "use codecs.encode() to encode to arbitrary types",
3672 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003673 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003674 Py_DECREF(v);
3675 goto onError;
3676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 return NULL;
3681}
3682
/* Locate the first byte sequence in str[0..len) that mbrtowc() rejects.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence.  Returns 0 when no such sequence is found, when a NUL
   character is reached first, or when mbrtowc() is not available
   (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3713
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003714PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003716 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717{
3718 wchar_t smallbuf[256];
3719 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3720 wchar_t *wstr;
3721 size_t wlen, wlen2;
3722 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003723 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003724 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003725 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003726 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003727
3728 if (locale_error_handler(errors, &surrogateescape) < 0)
3729 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003730
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003731 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3732 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003733 return NULL;
3734 }
3735
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003736 if (surrogateescape) {
3737 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003738 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003739 if (wstr == NULL) {
3740 if (wlen == (size_t)-1)
3741 PyErr_NoMemory();
3742 else
3743 PyErr_SetFromErrno(PyExc_OSError);
3744 return NULL;
3745 }
3746
3747 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003748 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003749 }
3750 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003751 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003752#ifndef HAVE_BROKEN_MBSTOWCS
3753 wlen = mbstowcs(NULL, str, 0);
3754#else
3755 wlen = len;
3756#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003757 if (wlen == (size_t)-1)
3758 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003759 if (wlen+1 <= smallbuf_len) {
3760 wstr = smallbuf;
3761 }
3762 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003763 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003764 if (!wstr)
3765 return PyErr_NoMemory();
3766 }
3767
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003768 wlen2 = mbstowcs(wstr, str, wlen+1);
3769 if (wlen2 == (size_t)-1) {
3770 if (wstr != smallbuf)
3771 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003772 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003773 }
3774#ifdef HAVE_BROKEN_MBSTOWCS
3775 assert(wlen2 == wlen);
3776#endif
3777 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3778 if (wstr != smallbuf)
3779 PyMem_Free(wstr);
3780 }
3781 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003782
3783decode_error:
3784 errmsg = strerror(errno);
3785 assert(errmsg != NULL);
3786
3787 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003788 wstr = Py_DecodeLocale(errmsg, &errlen);
3789 if (wstr != NULL) {
3790 reason = PyUnicode_FromWideChar(wstr, errlen);
3791 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003792 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003793
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003794 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003795 reason = PyUnicode_FromString(
3796 "mbstowcs() encountered an invalid multibyte sequence");
3797 if (reason == NULL)
3798 return NULL;
3799
3800 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3801 "locale", str, len,
3802 (Py_ssize_t)error_pos,
3803 (Py_ssize_t)(error_pos+1),
3804 reason);
3805 Py_DECREF(reason);
3806 if (exc != NULL) {
3807 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003808 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003809 }
3810 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003811}
3812
3813PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003814PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003815{
3816 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003817 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003818}
3819
3820
3821PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003822PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003823 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003824 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3825}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003826
Christian Heimes5894ba72007-11-04 11:43:14 +00003827PyObject*
3828PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3829{
Steve Dowercc16be82016-09-08 10:35:16 -07003830#if defined(__APPLE__)
3831 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003832#else
Victor Stinner793b5312011-04-27 00:24:21 +02003833 PyInterpreterState *interp = PyThreadState_GET()->interp;
3834 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3835 cannot use it to encode and decode filenames before it is loaded. Load
3836 the Python codec requires to encode at least its own filename. Use the C
3837 version of the locale codec until the codec registry is initialized and
3838 the Python codec is loaded.
3839
3840 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3841 cannot only rely on it: check also interp->fscodec_initialized for
3842 subinterpreters. */
3843 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003844 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003846 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 }
3848 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003849 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003850 }
Victor Stinnerad158722010-10-27 00:25:46 +00003851#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852}
3853
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854
3855int
3856PyUnicode_FSConverter(PyObject* arg, void* addr)
3857{
Brett Cannonec6ce872016-09-06 15:50:29 -07003858 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003859 PyObject *output = NULL;
3860 Py_ssize_t size;
3861 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003862 if (arg == NULL) {
3863 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003864 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003865 return 1;
3866 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003867 path = PyOS_FSPath(arg);
3868 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003869 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003870 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003871 if (PyBytes_Check(path)) {
3872 output = path;
3873 }
3874 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3875 output = PyUnicode_EncodeFSDefault(path);
3876 Py_DECREF(path);
3877 if (!output) {
3878 return 0;
3879 }
3880 assert(PyBytes_Check(output));
3881 }
3882
Victor Stinner0ea2a462010-04-30 00:22:08 +00003883 size = PyBytes_GET_SIZE(output);
3884 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003885 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003886 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003887 Py_DECREF(output);
3888 return 0;
3889 }
3890 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003891 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003892}
3893
3894
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003895int
3896PyUnicode_FSDecoder(PyObject* arg, void* addr)
3897{
Brett Cannona5711202016-09-06 19:36:01 -07003898 int is_buffer = 0;
3899 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003900 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003901 if (arg == NULL) {
3902 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003903 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003904 return 1;
3905 }
Brett Cannona5711202016-09-06 19:36:01 -07003906
3907 is_buffer = PyObject_CheckBuffer(arg);
3908 if (!is_buffer) {
3909 path = PyOS_FSPath(arg);
3910 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003911 return 0;
3912 }
Brett Cannona5711202016-09-06 19:36:01 -07003913 }
3914 else {
3915 path = arg;
3916 Py_INCREF(arg);
3917 }
3918
3919 if (PyUnicode_Check(path)) {
3920 if (PyUnicode_READY(path) == -1) {
3921 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003922 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003923 }
3924 output = path;
3925 }
3926 else if (PyBytes_Check(path) || is_buffer) {
3927 PyObject *path_bytes = NULL;
3928
3929 if (!PyBytes_Check(path) &&
3930 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3931 "path should be string, bytes, or os.PathLike, not %.200s",
3932 Py_TYPE(arg)->tp_name)) {
3933 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003934 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003935 }
3936 path_bytes = PyBytes_FromObject(path);
3937 Py_DECREF(path);
3938 if (!path_bytes) {
3939 return 0;
3940 }
3941 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3942 PyBytes_GET_SIZE(path_bytes));
3943 Py_DECREF(path_bytes);
3944 if (!output) {
3945 return 0;
3946 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003947 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003948 else {
3949 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003950 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003951 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003952 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 return 0;
3954 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003955 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003956 Py_DECREF(output);
3957 return 0;
3958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003960 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003961 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003962 Py_DECREF(output);
3963 return 0;
3964 }
3965 *(PyObject**)addr = output;
3966 return Py_CLEANUP_SUPPORTED;
3967}
3968
3969
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003970const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003972{
Christian Heimesf3863112007-11-22 07:46:41 +00003973 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003975 if (!PyUnicode_Check(unicode)) {
3976 PyErr_BadArgument();
3977 return NULL;
3978 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003979 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003980 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003982 if (PyUnicode_UTF8(unicode) == NULL) {
3983 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003984 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 if (bytes == NULL)
3986 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003987 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3988 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003989 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 Py_DECREF(bytes);
3991 return NULL;
3992 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003993 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003994 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 PyBytes_AS_STRING(bytes),
3996 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997 Py_DECREF(bytes);
3998 }
3999
4000 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004001 *psize = PyUnicode_UTF8_LENGTH(unicode);
4002 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004003}
4004
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004005const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4009}
4010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011Py_UNICODE *
4012PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 const unsigned char *one_byte;
4015#if SIZEOF_WCHAR_T == 4
4016 const Py_UCS2 *two_bytes;
4017#else
4018 const Py_UCS4 *four_bytes;
4019 const Py_UCS4 *ucs4_end;
4020 Py_ssize_t num_surrogates;
4021#endif
4022 wchar_t *w;
4023 wchar_t *wchar_end;
4024
4025 if (!PyUnicode_Check(unicode)) {
4026 PyErr_BadArgument();
4027 return NULL;
4028 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004029 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 assert(_PyUnicode_KIND(unicode) != 0);
4032 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4037 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 num_surrogates = 0;
4039
4040 for (; four_bytes < ucs4_end; ++four_bytes) {
4041 if (*four_bytes > 0xFFFF)
4042 ++num_surrogates;
4043 }
4044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004045 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4046 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4047 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 PyErr_NoMemory();
4049 return NULL;
4050 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004051 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004053 w = _PyUnicode_WSTR(unicode);
4054 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4055 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4057 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004058 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004060 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4061 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 }
4063 else
4064 *w = *four_bytes;
4065
4066 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07004067 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 }
4069 }
4070 *w = 0;
4071#else
4072 /* sizeof(wchar_t) == 4 */
4073 Py_FatalError("Impossible unicode object state, wstr and str "
4074 "should share memory already.");
4075 return NULL;
4076#endif
4077 }
4078 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004079 if ((size_t)_PyUnicode_LENGTH(unicode) >
4080 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4081 PyErr_NoMemory();
4082 return NULL;
4083 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004084 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4085 (_PyUnicode_LENGTH(unicode) + 1));
4086 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087 PyErr_NoMemory();
4088 return NULL;
4089 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004090 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4091 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4092 w = _PyUnicode_WSTR(unicode);
4093 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004095 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4096 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 for (; w < wchar_end; ++one_byte, ++w)
4098 *w = *one_byte;
4099 /* null-terminate the wstr */
4100 *w = 0;
4101 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004102 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004104 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 for (; w < wchar_end; ++two_bytes, ++w)
4106 *w = *two_bytes;
4107 /* null-terminate the wstr */
4108 *w = 0;
4109#else
4110 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004111 PyObject_FREE(_PyUnicode_WSTR(unicode));
4112 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113 Py_FatalError("Impossible unicode object state, wstr "
4114 "and str should share memory already.");
4115 return NULL;
4116#endif
4117 }
4118 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07004119 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 }
4121 }
4122 }
4123 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004124 *size = PyUnicode_WSTR_LENGTH(unicode);
4125 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004126}
4127
Alexander Belopolsky40018472011-02-26 01:02:56 +00004128Py_UNICODE *
4129PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132}
4133
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004134const Py_UNICODE *
4135_PyUnicode_AsUnicode(PyObject *unicode)
4136{
4137 Py_ssize_t size;
4138 const Py_UNICODE *wstr;
4139
4140 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4141 if (wstr && wcslen(wstr) != (size_t)size) {
4142 PyErr_SetString(PyExc_ValueError, "embedded null character");
4143 return NULL;
4144 }
4145 return wstr;
4146}
4147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148
Alexander Belopolsky40018472011-02-26 01:02:56 +00004149Py_ssize_t
4150PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151{
4152 if (!PyUnicode_Check(unicode)) {
4153 PyErr_BadArgument();
4154 goto onError;
4155 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004156 if (_PyUnicode_WSTR(unicode) == NULL) {
4157 if (PyUnicode_AsUnicode(unicode) == NULL)
4158 goto onError;
4159 }
4160 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 return -1;
4164}
4165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166Py_ssize_t
4167PyUnicode_GetLength(PyObject *unicode)
4168{
Victor Stinner07621332012-06-16 04:53:46 +02004169 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 PyErr_BadArgument();
4171 return -1;
4172 }
Victor Stinner07621332012-06-16 04:53:46 +02004173 if (PyUnicode_READY(unicode) == -1)
4174 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004175 return PyUnicode_GET_LENGTH(unicode);
4176}
4177
4178Py_UCS4
4179PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4180{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004181 void *data;
4182 int kind;
4183
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004184 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004185 PyErr_BadArgument();
4186 return (Py_UCS4)-1;
4187 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004188 if (PyUnicode_READY(unicode) == -1) {
4189 return (Py_UCS4)-1;
4190 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004191 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004192 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 return (Py_UCS4)-1;
4194 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004195 data = PyUnicode_DATA(unicode);
4196 kind = PyUnicode_KIND(unicode);
4197 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198}
4199
4200int
4201PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4202{
4203 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004204 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 return -1;
4206 }
Victor Stinner488fa492011-12-12 00:01:39 +01004207 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004208 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004209 PyErr_SetString(PyExc_IndexError, "string index out of range");
4210 return -1;
4211 }
Victor Stinner488fa492011-12-12 00:01:39 +01004212 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004213 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004214 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4215 PyErr_SetString(PyExc_ValueError, "character out of range");
4216 return -1;
4217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4219 index, ch);
4220 return 0;
4221}
4222
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4228
Victor Stinner554f3f02010-06-16 23:33:54 +00004229/* create or adjust a UnicodeDecodeError */
4230static void
4231make_decode_exception(PyObject **exceptionObject,
4232 const char *encoding,
4233 const char *input, Py_ssize_t length,
4234 Py_ssize_t startpos, Py_ssize_t endpos,
4235 const char *reason)
4236{
4237 if (*exceptionObject == NULL) {
4238 *exceptionObject = PyUnicodeDecodeError_Create(
4239 encoding, input, length, startpos, endpos, reason);
4240 }
4241 else {
4242 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4243 goto onError;
4244 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4245 goto onError;
4246 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4247 goto onError;
4248 }
4249 return;
4250
4251onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004252 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004253}
4254
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders whose output is a
   wchar_t-kind string.

   Looks up the error handler named by `errors` (cached in *errorHandler),
   creates or updates the exception in *exceptionObject, calls the handler
   and requires a (str, int) tuple as its result.  The replacement string is
   copied into *output at *outpos, growing *output when needed, and the
   input pointers/positions are updated so decoding resumes at the position
   the handler returned.

   Returns 0 on success, -1 on error.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The first three characters are the PyArg_ParseTuple format; the rest
       doubles as the TypeError message (hence &argparse[3] below). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *handler_result = NULL;
    PyObject *replacement = NULL;
    PyObject *input_bytes = NULL;
    Py_ssize_t capacity;
    Py_ssize_t input_len;
    Py_ssize_t needed;
    Py_ssize_t resume_at;
    Py_ssize_t tail_len;
    wchar_t *rep_chars;
    Py_ssize_t rep_len;
    int status = -1;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    capacity = _PyUnicode_WSTR_LENGTH(*output);

    /* Resolve the handler lazily and cache it for later errors. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL) {
            goto done;
        }
    }

    make_decode_exception(exceptionObject,
                          encoding,
                          *input, *inend - *input,
                          *startinpos, *endinpos,
                          reason);
    if (*exceptionObject == NULL) {
        goto done;
    }

    handler_result = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (handler_result == NULL) {
        goto done;
    }
    if (!PyTuple_Check(handler_result)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto done;
    }
    if (!PyArg_ParseTuple(handler_result, argparse, &replacement, &resume_at)) {
        goto done;
    }

    /* The callback may have replaced the input object on the exception;
       reload the input pointers from it. */
    input_bytes = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (input_bytes == NULL) {
        goto done;
    }
    *input = PyBytes_AS_STRING(input_bytes);
    input_len = PyBytes_GET_SIZE(input_bytes);
    *inend = *input + input_len;
    /* Safe to release now: the exception still holds a reference, so the
       bytes object stays alive. */
    Py_DECREF(input_bytes);

    /* A negative position counts from the end of the input. */
    if (resume_at < 0) {
        resume_at = input_len + resume_at;
    }
    if (resume_at < 0 || resume_at > input_len) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", resume_at);
        goto done;
    }

    rep_chars = PyUnicode_AsUnicodeAndSize(replacement, &rep_len);
    if (rep_chars == NULL) {
        goto done;
    }

    /* Reserve room for what has been written so far, plus the replacement,
       plus the rest of the input starting at the new position, so no
       further space checks are needed if the rest decodes cleanly.  Each
       addition is guarded against Py_ssize_t overflow. */
    tail_len = input_len - resume_at;
    needed = *outpos;
    if (needed > PY_SSIZE_T_MAX - rep_len) {
        goto overflow;
    }
    needed += rep_len;
    if (needed > PY_SSIZE_T_MAX - tail_len) {
        goto overflow;
    }
    needed += tail_len;
    if (needed > capacity) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (capacity <= PY_SSIZE_T_MAX/2 && needed < 2*capacity) {
            needed = 2*capacity;
        }
        if (unicode_resize(output, needed) < 0) {
            goto done;
        }
    }

    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, rep_chars, rep_len);
    *outpos += rep_len;
    *endinpos = resume_at;
    *inptr = *input + resume_at;

    /* we made it! */
    status = 0;
    goto done;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  done:
    Py_XDECREF(handler_result);
    return status;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367
4368static int
4369unicode_decode_call_errorhandler_writer(
4370 const char *errors, PyObject **errorHandler,
4371 const char *encoding, const char *reason,
4372 const char **input, const char **inend, Py_ssize_t *startinpos,
4373 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4374 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4375{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004376 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004377
4378 PyObject *restuple = NULL;
4379 PyObject *repunicode = NULL;
4380 Py_ssize_t insize;
4381 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004382 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004383 PyObject *inputobj = NULL;
4384
4385 if (*errorHandler == NULL) {
4386 *errorHandler = PyCodec_LookupError(errors);
4387 if (*errorHandler == NULL)
4388 goto onError;
4389 }
4390
4391 make_decode_exception(exceptionObject,
4392 encoding,
4393 *input, *inend - *input,
4394 *startinpos, *endinpos,
4395 reason);
4396 if (*exceptionObject == NULL)
4397 goto onError;
4398
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004399 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 if (restuple == NULL)
4401 goto onError;
4402 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 goto onError;
4405 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004406 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408
4409 /* Copy back the bytes variables, which might have been modified by the
4410 callback */
4411 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4412 if (!inputobj)
4413 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004414 *input = PyBytes_AS_STRING(inputobj);
4415 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004416 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004417 /* we can DECREF safely, as the exception has another reference,
4418 so the object won't go away. */
4419 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004423 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004424 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427
Victor Stinner170ca6f2013-04-18 00:25:28 +02004428 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004429 if (replen > 1) {
4430 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004431 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004432 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4433 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4434 goto onError;
4435 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004437 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004440 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004443 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449}
4450
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451/* --- UTF-7 Codec -------------------------------------------------------- */
4452
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453/* See RFC2152 for details. We encode conservatively and decode liberally. */
4454
4455/* Three simple macros defining base-64. */
4456
/* True when ch is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' or '/').  Evaluates its argument more than once. */

#define IS_BASE64(ch)                       \
    (((ch) >= 'A' && (ch) <= 'Z') ||        \
     ((ch) >= 'a' && (ch) <= 'z') ||        \
     ((ch) >= '0' && (ch) <= '9') ||        \
     (ch) == '+' ||                         \
     (ch) == '/')
4464
/* Map a base-64 character to its 6-bit value: A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62, and anything else -> 63 (callers are expected
   to pass only base-64 characters, so "anything else" is '/').
   Evaluates its argument more than once. */

#define FROM_BASE64(ch)                                     \
    (((ch) >= 'A' && (ch) <= 'Z') ? (ch) - 'A' :            \
     ((ch) >= 'a' && (ch) <= 'z') ? (ch) - 'a' + 26 :       \
     ((ch) >= '0' && (ch) <= '9') ? (ch) - '0' + 52 :       \
     (ch) == '+' ? 62 : 63)
4472
/* Base-64 character for the low 6 bits of `bits` (higher bits are masked
   off before the alphabet lookup). */

#define TO_BASE64(bits) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(bits) & 0x3f])
4477
/* DECODE_DIRECT: true when a byte met outside a shift sequence decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(ch) \
    ((ch) <= 127 && (ch) != '+')
4485
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by ASCII code:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4519
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the code point to classify; NUL and anything >= 128 are
 *            never direct
 * directO  - non-zero when "Set O" characters (category 1) go out as-is
 * directWS - non-zero when whitespace (category 2) goes out as-is
 *
 * All three arguments are parenthesized in the expansion so that compound
 * expressions such as `a || b` keep their meaning next to `&&`.
 * `c` is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Alexander Belopolsky40018472011-02-26 01:02:56 +00004532PyObject *
4533PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004534 Py_ssize_t size,
4535 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004537 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4538}
4539
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540/* The decoder. The only state we preserve is our read position,
4541 * i.e. how many characters we have consumed. So if we end in the
4542 * middle of a shift sequence we have to back off the read position
4543 * and the output to the beginning of the sequence, otherwise we lose
4544 * all the shift state (seen bits, number of bits seen, high
4545 * surrogate). */
4546
Alexander Belopolsky40018472011-02-26 01:02:56 +00004547PyObject *
4548PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004549 Py_ssize_t size,
4550 const char *errors,
4551 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004552{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t startinpos;
4555 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558 const char *errmsg = "";
4559 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004560 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 unsigned int base64bits = 0;
4562 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004563 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 PyObject *errorHandler = NULL;
4565 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004567 if (size == 0) {
4568 if (consumed)
4569 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004570 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004571 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004574 _PyUnicodeWriter_Init(&writer);
4575 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004576
4577 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 e = s + size;
4579
4580 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004581 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004583 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 if (inShift) { /* in a base-64 section */
4586 if (IS_BASE64(ch)) { /* consume a base-64 character */
4587 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4588 base64bits += 6;
4589 s++;
4590 if (base64bits >= 16) {
4591 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004592 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 base64bits -= 16;
4594 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004595 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 if (surrogate) {
4597 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004598 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4599 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004600 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004601 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004603 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 }
4605 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004606 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004607 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 }
4610 }
Victor Stinner551ac952011-11-29 22:58:13 +01004611 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 /* first surrogate */
4613 surrogate = outCh;
4614 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004616 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004617 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 }
4619 }
4620 }
4621 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004622 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 if (base64bits > 0) { /* left-over bits */
4624 if (base64bits >= 6) {
4625 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004626 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 errmsg = "partial character in shift sequence";
4628 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 else {
4631 /* Some bits remain; they should be zero */
4632 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004633 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 errmsg = "non-zero padding bits in shift sequence";
4635 goto utf7Error;
4636 }
4637 }
4638 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004639 if (surrogate && DECODE_DIRECT(ch)) {
4640 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4641 goto onError;
4642 }
4643 surrogate = 0;
4644 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 /* '-' is absorbed; other terminating
4646 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004647 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
4650 }
4651 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 s++; /* consume '+' */
4654 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004656 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004657 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 }
4659 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004661 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004664 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 }
4666 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004669 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 else {
4673 startinpos = s-starts;
4674 s++;
4675 errmsg = "unexpected special character";
4676 goto utf7Error;
4677 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004681 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 errors, &errorHandler,
4683 "utf7", errmsg,
4684 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687 }
4688
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689 /* end of string */
4690
4691 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4692 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004693 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 if (surrogate ||
4695 (base64bits >= 6) ||
4696 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004698 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 errors, &errorHandler,
4700 "utf7", "unterminated shift sequence",
4701 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004702 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 goto onError;
4704 if (s < e)
4705 goto restart;
4706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708
4709 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004710 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004712 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004713 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004714 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004715 writer.kind, writer.data, shiftOutStart);
4716 Py_XDECREF(errorHandler);
4717 Py_XDECREF(exc);
4718 _PyUnicodeWriter_Dealloc(&writer);
4719 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004720 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004721 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 }
4723 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004724 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004726 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(errorHandler);
4729 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004730 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_XDECREF(errorHandler);
4734 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004735 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736 return NULL;
4737}
4738
4739
Alexander Belopolsky40018472011-02-26 01:02:56 +00004740PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004741_PyUnicode_EncodeUTF7(PyObject *str,
4742 int base64SetO,
4743 int base64WhiteSpace,
4744 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004746 int kind;
4747 void *data;
4748 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004749 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004751 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 unsigned int base64bits = 0;
4753 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754 char * out;
4755 char * start;
4756
Benjamin Petersonbac79492012-01-14 13:34:47 -05004757 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004758 return NULL;
4759 kind = PyUnicode_KIND(str);
4760 data = PyUnicode_DATA(str);
4761 len = PyUnicode_GET_LENGTH(str);
4762
4763 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004767 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004768 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004769 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770 if (v == NULL)
4771 return NULL;
4772
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004773 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004774 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004775 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Antoine Pitrou244651a2009-05-04 18:56:13 +00004777 if (inShift) {
4778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4779 /* shifting out */
4780 if (base64bits) { /* output remaining bits */
4781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4782 base64buffer = 0;
4783 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 }
4785 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 /* Characters not in the BASE64 set implicitly unshift the sequence
4787 so no '-' is required, except if the character is itself a '-' */
4788 if (IS_BASE64(ch) || ch == '-') {
4789 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 *out++ = (char) ch;
4792 }
4793 else {
4794 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004795 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004796 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 else { /* not in a shift sequence */
4798 if (ch == '+') {
4799 *out++ = '+';
4800 *out++ = '-';
4801 }
4802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4803 *out++ = (char) ch;
4804 }
4805 else {
4806 *out++ = '+';
4807 inShift = 1;
4808 goto encode_char;
4809 }
4810 }
4811 continue;
4812encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004814 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004815
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 /* code first surrogate */
4817 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004818 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 while (base64bits >= 6) {
4820 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4821 base64bits -= 6;
4822 }
4823 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004824 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004826 base64bits += 16;
4827 base64buffer = (base64buffer << 16) | ch;
4828 while (base64bits >= 6) {
4829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4830 base64bits -= 6;
4831 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004832 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833 if (base64bits)
4834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4835 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004836 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004837 if (_PyBytes_Resize(&v, out - start) < 0)
4838 return NULL;
4839 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004841PyObject *
4842PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4843 Py_ssize_t size,
4844 int base64SetO,
4845 int base64WhiteSpace,
4846 const char *errors)
4847{
4848 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004849 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004850 if (tmp == NULL)
4851 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004852 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004853 base64WhiteSpace, errors);
4854 Py_DECREF(tmp);
4855 return result;
4856}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004857
/* The UTF-7 helper macros are private to the codec above; drop them so
   they do not leak into the rest of the file. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864/* --- UTF-8 Codec -------------------------------------------------------- */
4865
Alexander Belopolsky40018472011-02-26 01:02:56 +00004866PyObject *
4867PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004868 Py_ssize_t size,
4869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald69652032004-09-07 20:24:22 +00004871 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4872}
4873
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004874#include "stringlib/asciilib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004878#include "stringlib/ucs1lib.h"
4879#include "stringlib/codecs.h"
4880#include "stringlib/undef.h"
4881
4882#include "stringlib/ucs2lib.h"
4883#include "stringlib/codecs.h"
4884#include "stringlib/undef.h"
4885
4886#include "stringlib/ucs4lib.h"
4887#include "stringlib/codecs.h"
4888#include "stringlib/undef.h"
4889
Antoine Pitrouab868312009-01-10 15:40:25 +00004890/* Mask to quickly check whether a C 'long' contains a
4891 non-ASCII, UTF8-encoded char. */
4892#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004893# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004894#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004895# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004896#else
4897# error C 'long' size should be either 4 or 8!
4898#endif
4899
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900static Py_ssize_t
4901ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004904 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004906 /*
4907 * Issue #17237: m68k is a bit different from most architectures in
4908 * that objects do not use "natural alignment" - for example, int and
4909 * long are only aligned at 2-byte boundaries. Therefore the assert()
4910 * won't work; also, tests have shown that skipping the "optimised
4911 * version" will even speed up m68k.
4912 */
4913#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004915 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4916 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 /* Fast path, see in STRINGLIB(utf8_decode) for
4918 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004919 /* Help allocation */
4920 const char *_p = p;
4921 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 while (_p < aligned_end) {
4923 unsigned long value = *(const unsigned long *) _p;
4924 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 *((unsigned long *)q) = value;
4927 _p += SIZEOF_LONG;
4928 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004929 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 p = _p;
4931 while (p < end) {
4932 if ((unsigned char)*p & 0x80)
4933 break;
4934 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004939#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 while (p < end) {
4941 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4942 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004943 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004944 /* Help allocation */
4945 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 while (_p < aligned_end) {
4947 unsigned long value = *(unsigned long *) _p;
4948 if (value & ASCII_CHAR_MASK)
4949 break;
4950 _p += SIZEOF_LONG;
4951 }
4952 p = _p;
4953 if (_p == end)
4954 break;
4955 }
4956 if ((unsigned char)*p & 0x80)
4957 break;
4958 ++p;
4959 }
4960 memcpy(dest, start, p - start);
4961 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962}
Antoine Pitrouab868312009-01-10 15:40:25 +00004963
Victor Stinner785938e2011-12-11 20:09:03 +01004964PyObject *
4965PyUnicode_DecodeUTF8Stateful(const char *s,
4966 Py_ssize_t size,
4967 const char *errors,
4968 Py_ssize_t *consumed)
4969{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004970 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004971 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973
4974 Py_ssize_t startinpos;
4975 Py_ssize_t endinpos;
4976 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004977 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004979 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004980
4981 if (size == 0) {
4982 if (consumed)
4983 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004984 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004985 }
4986
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4988 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004989 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 *consumed = 1;
4991 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004992 }
4993
Victor Stinner8f674cc2013-04-17 23:02:17 +02004994 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004995 writer.min_length = size;
4996 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004998
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 writer.pos = ascii_decode(s, end, writer.data);
5000 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 while (s < end) {
5002 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005004
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 if (PyUnicode_IS_ASCII(writer.buffer))
5007 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 } else {
5013 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005014 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 }
5016
5017 switch (ch) {
5018 case 0:
5019 if (s == end || consumed)
5020 goto End;
5021 errmsg = "unexpected end of data";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 case 1:
5026 errmsg = "invalid start byte";
5027 startinpos = s - starts;
5028 endinpos = startinpos + 1;
5029 break;
5030 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005031 case 3:
5032 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 errmsg = "invalid continuation byte";
5034 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005035 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 break;
5037 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005038 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 goto onError;
5040 continue;
5041 }
5042
Victor Stinner1d65d912015-10-05 13:43:50 +02005043 if (error_handler == _Py_ERROR_UNKNOWN)
5044 error_handler = get_error_handler(errors);
5045
5046 switch (error_handler) {
5047 case _Py_ERROR_IGNORE:
5048 s += (endinpos - startinpos);
5049 break;
5050
5051 case _Py_ERROR_REPLACE:
5052 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5053 goto onError;
5054 s += (endinpos - startinpos);
5055 break;
5056
5057 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005058 {
5059 Py_ssize_t i;
5060
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5062 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005063 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005064 ch = (Py_UCS4)(unsigned char)(starts[i]);
5065 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5066 ch + 0xdc00);
5067 writer.pos++;
5068 }
5069 s += (endinpos - startinpos);
5070 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005071 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005072
5073 default:
5074 if (unicode_decode_call_errorhandler_writer(
5075 errors, &error_handler_obj,
5076 "utf-8", errmsg,
5077 &starts, &end, &startinpos, &endinpos, &exc, &s,
5078 &writer))
5079 goto onError;
5080 }
Victor Stinner785938e2011-12-11 20:09:03 +01005081 }
5082
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 if (consumed)
5085 *consumed = s - starts;
5086
Victor Stinner1d65d912015-10-05 13:43:50 +02005087 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005088 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005089 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090
5091onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005092 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005094 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005095 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005096}
5097
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   s     UTF-8 encoded input; exactly `size` bytes are read
   size  number of input bytes

   Bytes that cannot be decoded are stored as 0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the buffer
   size would not fit in a Py_ssize_t or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 elements (terminator included) are
       enough.  The limit is tested without ever forming size + 1, which
       would itself overflow when size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* NOTE(review): presumably the UCS4 decoder stores every
               character itself and never hands one back -- confirm in
               stringlib/codecs.h. */
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* A zero result with all input used up means we are done. */
            if (!ch && s == e)
                break;
            /* surrogateescape: keep the offending byte as 0xDC00 + byte
               and resume decoding right after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154/* Primary internal function which creates utf8 encoded bytes objects.
5155
5156 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005157 and allocate exactly as much space needed at the end. Else allocate the
5158 maximum possible needed (4 result bytes per Unicode character), and return
5159 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005160*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005161PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005162_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Victor Stinner6099a032011-12-18 14:22:26 +01005164 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 void *data;
5166 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 if (!PyUnicode_Check(unicode)) {
5169 PyErr_BadArgument();
5170 return NULL;
5171 }
5172
5173 if (PyUnicode_READY(unicode) == -1)
5174 return NULL;
5175
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005176 if (PyUnicode_UTF8(unicode))
5177 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5178 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005179
5180 kind = PyUnicode_KIND(unicode);
5181 data = PyUnicode_DATA(unicode);
5182 size = PyUnicode_GET_LENGTH(unicode);
5183
Benjamin Petersonead6b532011-12-20 17:23:42 -06005184 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005185 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005186 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005187 case PyUnicode_1BYTE_KIND:
5188 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5189 assert(!PyUnicode_IS_ASCII(unicode));
5190 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5191 case PyUnicode_2BYTE_KIND:
5192 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5193 case PyUnicode_4BYTE_KIND:
5194 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196}
5197
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5200 Py_ssize_t size,
5201 const char *errors)
5202{
5203 PyObject *v, *unicode;
5204
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005205 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005206 if (unicode == NULL)
5207 return NULL;
5208 v = _PyUnicode_AsUTF8String(unicode, errors);
5209 Py_DECREF(unicode);
5210 return v;
5211}
5212
5213PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005214PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005216 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217}
5218
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219/* --- UTF-32 Codec ------------------------------------------------------- */
5220
5221PyObject *
5222PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 Py_ssize_t size,
5224 const char *errors,
5225 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226{
5227 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5228}
5229
5230PyObject *
5231PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder,
5235 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236{
5237 const char *starts = s;
5238 Py_ssize_t startinpos;
5239 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005241 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005242 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 PyObject *errorHandler = NULL;
5246 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005247
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 q = (unsigned char *)s;
5249 e = q + size;
5250
5251 if (byteorder)
5252 bo = *byteorder;
5253
5254 /* Check for BOM marks (U+FEFF) in the input and adjust current
5255 byte order setting accordingly. In native mode, the leading BOM
5256 mark is skipped, in all other modes, it is copied to the output
5257 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005258 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005259 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005260 if (bom == 0x0000FEFF) {
5261 bo = -1;
5262 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 else if (bom == 0xFFFE0000) {
5265 bo = 1;
5266 q += 4;
5267 }
5268 if (byteorder)
5269 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005270 }
5271
Victor Stinnere64322e2012-10-30 23:12:47 +01005272 if (q == e) {
5273 if (consumed)
5274 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005275 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276 }
5277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278#ifdef WORDS_BIGENDIAN
5279 le = bo < 0;
5280#else
5281 le = bo <= 0;
5282#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005283 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005284
Victor Stinner8f674cc2013-04-17 23:02:17 +02005285 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005286 writer.min_length = (e - q + 3) / 4;
5287 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005288 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005289
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 while (1) {
5291 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005293
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 enum PyUnicode_Kind kind = writer.kind;
5296 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005298 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 if (le) {
5300 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (ch > maxch)
5303 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 if (kind != PyUnicode_1BYTE_KIND &&
5305 Py_UNICODE_IS_SURROGATE(ch))
5306 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 q += 4;
5309 } while (q <= last);
5310 }
5311 else {
5312 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005313 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 if (ch > maxch)
5315 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005316 if (kind != PyUnicode_1BYTE_KIND &&
5317 Py_UNICODE_IS_SURROGATE(ch))
5318 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005319 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 q += 4;
5321 } while (q <= last);
5322 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 }
5325
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005326 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005327 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005328 startinpos = ((const char *)q) - starts;
5329 endinpos = startinpos + 4;
5330 }
5331 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005334 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 startinpos = ((const char *)q) - starts;
5337 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 else {
5340 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005341 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005342 goto onError;
5343 q += 4;
5344 continue;
5345 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005346 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005347 startinpos = ((const char *)q) - starts;
5348 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005350
5351 /* The remaining input chars are ignored if the callback
5352 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005353 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005355 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 }
5360
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364 Py_XDECREF(errorHandler);
5365 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005369 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 Py_XDECREF(errorHandler);
5371 Py_XDECREF(exc);
5372 return NULL;
5373}
5374
5375PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005376_PyUnicode_EncodeUTF32(PyObject *str,
5377 const char *errors,
5378 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 enum PyUnicode_Kind kind;
5381 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005382 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005383 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005384 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005385#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005386 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005389#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005390 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005392 PyObject *errorHandler = NULL;
5393 PyObject *exc = NULL;
5394 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396 if (!PyUnicode_Check(str)) {
5397 PyErr_BadArgument();
5398 return NULL;
5399 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005400 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005401 return NULL;
5402 kind = PyUnicode_KIND(str);
5403 data = PyUnicode_DATA(str);
5404 len = PyUnicode_GET_LENGTH(str);
5405
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005407 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005408 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005409 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005410 if (v == NULL)
5411 return NULL;
5412
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005413 /* output buffer is 4-bytes aligned */
5414 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005415 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005418 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005419 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005422 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 else
5426 encoding = "utf-32";
5427
5428 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5430 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431 }
5432
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433 pos = 0;
5434 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436
5437 if (kind == PyUnicode_2BYTE_KIND) {
5438 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5439 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005440 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005441 else {
5442 assert(kind == PyUnicode_4BYTE_KIND);
5443 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5444 &out, native_ordering);
5445 }
5446 if (pos == len)
5447 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005448
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 rep = unicode_encode_call_errorhandler(
5450 errors, &errorHandler,
5451 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005452 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005453 if (!rep)
5454 goto error;
5455
5456 if (PyBytes_Check(rep)) {
5457 repsize = PyBytes_GET_SIZE(rep);
5458 if (repsize & 3) {
5459 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 "surrogates not allowed");
5462 goto error;
5463 }
5464 moreunits = repsize / 4;
5465 }
5466 else {
5467 assert(PyUnicode_Check(rep));
5468 if (PyUnicode_READY(rep) < 0)
5469 goto error;
5470 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5471 if (!PyUnicode_IS_ASCII(rep)) {
5472 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005473 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005474 "surrogates not allowed");
5475 goto error;
5476 }
5477 }
5478
5479 /* four bytes are reserved for each surrogate */
5480 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005481 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005482 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 /* integer overflow */
5484 PyErr_NoMemory();
5485 goto error;
5486 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005487 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005489 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005490 }
5491
5492 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005493 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005494 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005497 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5498 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 }
5500
5501 Py_CLEAR(rep);
5502 }
5503
5504 /* Cut back to size actually needed. This is necessary for, for example,
5505 encoding of a string containing isolated surrogates and the 'ignore'
5506 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005507 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 if (nsize != PyBytes_GET_SIZE(v))
5509 _PyBytes_Resize(&v, nsize);
5510 Py_XDECREF(errorHandler);
5511 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005512 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005513 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005514 error:
5515 Py_XDECREF(rep);
5516 Py_XDECREF(errorHandler);
5517 Py_XDECREF(exc);
5518 Py_XDECREF(v);
5519 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520}
5521
Alexander Belopolsky40018472011-02-26 01:02:56 +00005522PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5524 Py_ssize_t size,
5525 const char *errors,
5526 int byteorder)
5527{
5528 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005529 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530 if (tmp == NULL)
5531 return NULL;
5532 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5533 Py_DECREF(tmp);
5534 return result;
5535}
5536
5537PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005538PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005539{
Victor Stinnerb960b342011-11-20 19:12:52 +01005540 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005541}
5542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543/* --- UTF-16 Codec ------------------------------------------------------- */
5544
Tim Peters772747b2001-08-09 22:21:55 +00005545PyObject *
5546PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 Py_ssize_t size,
5548 const char *errors,
5549 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
Walter Dörwald69652032004-09-07 20:24:22 +00005551 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5552}
5553
5554PyObject *
5555PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 Py_ssize_t size,
5557 const char *errors,
5558 int *byteorder,
5559 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t startinpos;
5563 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005564 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005565 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005566 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005567 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005568 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 PyObject *errorHandler = NULL;
5570 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
Tim Peters772747b2001-08-09 22:21:55 +00005573 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
5576 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005577 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005579 /* Check for BOM marks (U+FEFF) in the input and adjust current
5580 byte order setting accordingly. In native mode, the leading BOM
5581 mark is skipped, in all other modes, it is copied to the output
5582 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005583 if (bo == 0 && size >= 2) {
5584 const Py_UCS4 bom = (q[1] << 8) | q[0];
5585 if (bom == 0xFEFF) {
5586 q += 2;
5587 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 else if (bom == 0xFFFE) {
5590 q += 2;
5591 bo = 1;
5592 }
5593 if (byteorder)
5594 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 if (q == e) {
5598 if (consumed)
5599 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005600 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005601 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005602
Christian Heimes743e0cd2012-10-17 23:52:17 +02005603#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005605 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005606#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005609#endif
Tim Peters772747b2001-08-09 22:21:55 +00005610
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 /* Note: size will always be longer than the resulting Unicode
5612 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005613 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005614 writer.min_length = (e - q + 1) / 2;
5615 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005616 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 while (1) {
5619 Py_UCS4 ch = 0;
5620 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 native_ordering);
5627 else
5628 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 native_ordering);
5631 } else if (kind == PyUnicode_2BYTE_KIND) {
5632 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 } else {
5636 assert(kind == PyUnicode_4BYTE_KIND);
5637 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005640 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005641 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 switch (ch)
5644 {
5645 case 0:
5646 /* remaining byte at the end? (size should be even) */
5647 if (q == e || consumed)
5648 goto End;
5649 errmsg = "truncated data";
5650 startinpos = ((const char *)q) - starts;
5651 endinpos = ((const char *)e) - starts;
5652 break;
5653 /* The remaining input chars are ignored if the callback
5654 chooses to skip the input */
5655 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005656 q -= 2;
5657 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005658 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005659 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005660 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 endinpos = ((const char *)e) - starts;
5662 break;
5663 case 2:
5664 errmsg = "illegal encoding";
5665 startinpos = ((const char *)q) - 2 - starts;
5666 endinpos = startinpos + 2;
5667 break;
5668 case 3:
5669 errmsg = "illegal UTF-16 surrogate";
5670 startinpos = ((const char *)q) - 4 - starts;
5671 endinpos = startinpos + 2;
5672 break;
5673 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005674 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 continue;
5677 }
5678
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005680 errors,
5681 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005682 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005683 &starts,
5684 (const char **)&e,
5685 &startinpos,
5686 &endinpos,
5687 &exc,
5688 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
5692
Antoine Pitrou63065d72012-05-15 23:48:04 +02005693End:
Walter Dörwald69652032004-09-07 20:24:22 +00005694 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 Py_XDECREF(errorHandler);
5698 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005702 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 Py_XDECREF(errorHandler);
5704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return NULL;
5706}
5707
Tim Peters772747b2001-08-09 22:21:55 +00005708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005709_PyUnicode_EncodeUTF16(PyObject *str,
5710 const char *errors,
5711 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005713 enum PyUnicode_Kind kind;
5714 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005716 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005717 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005719#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005721#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005723#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 const char *encoding;
5725 Py_ssize_t nsize, pos;
5726 PyObject *errorHandler = NULL;
5727 PyObject *exc = NULL;
5728 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 if (!PyUnicode_Check(str)) {
5731 PyErr_BadArgument();
5732 return NULL;
5733 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005734 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 return NULL;
5736 kind = PyUnicode_KIND(str);
5737 data = PyUnicode_DATA(str);
5738 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005739
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005741 if (kind == PyUnicode_4BYTE_KIND) {
5742 const Py_UCS4 *in = (const Py_UCS4 *)data;
5743 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 while (in < end) {
5745 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005746 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 }
5748 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 nsize = len + pairs + (byteorder == 0);
5754 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005759 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005760 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005763 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
5765 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005766 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 }
Tim Peters772747b2001-08-09 22:21:55 +00005768
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 if (kind == PyUnicode_1BYTE_KIND) {
5770 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5771 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005772 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 }
5777 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005779 }
5780 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783
5784 pos = 0;
5785 while (pos < len) {
5786 Py_ssize_t repsize, moreunits;
5787
5788 if (kind == PyUnicode_2BYTE_KIND) {
5789 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5790 &out, native_ordering);
5791 }
5792 else {
5793 assert(kind == PyUnicode_4BYTE_KIND);
5794 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5795 &out, native_ordering);
5796 }
5797 if (pos == len)
5798 break;
5799
5800 rep = unicode_encode_call_errorhandler(
5801 errors, &errorHandler,
5802 encoding, "surrogates not allowed",
5803 str, &exc, pos, pos + 1, &pos);
5804 if (!rep)
5805 goto error;
5806
5807 if (PyBytes_Check(rep)) {
5808 repsize = PyBytes_GET_SIZE(rep);
5809 if (repsize & 1) {
5810 raise_encode_exception(&exc, encoding,
5811 str, pos - 1, pos,
5812 "surrogates not allowed");
5813 goto error;
5814 }
5815 moreunits = repsize / 2;
5816 }
5817 else {
5818 assert(PyUnicode_Check(rep));
5819 if (PyUnicode_READY(rep) < 0)
5820 goto error;
5821 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5822 if (!PyUnicode_IS_ASCII(rep)) {
5823 raise_encode_exception(&exc, encoding,
5824 str, pos - 1, pos,
5825 "surrogates not allowed");
5826 goto error;
5827 }
5828 }
5829
5830 /* two bytes are reserved for each surrogate */
5831 if (moreunits > 1) {
5832 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005833 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 /* integer overflow */
5835 PyErr_NoMemory();
5836 goto error;
5837 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005838 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 goto error;
5840 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5841 }
5842
5843 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005844 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005845 out += moreunits;
5846 } else /* rep is unicode */ {
5847 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5848 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5849 &out, native_ordering);
5850 }
5851
5852 Py_CLEAR(rep);
5853 }
5854
5855 /* Cut back to size actually needed. This is necessary for, for example,
5856 encoding of a string containing isolated surrogates and the 'ignore' handler
5857 is used. */
5858 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5859 if (nsize != PyBytes_GET_SIZE(v))
5860 _PyBytes_Resize(&v, nsize);
5861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005863 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005864 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005865 error:
5866 Py_XDECREF(rep);
5867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
5869 Py_XDECREF(v);
5870 return NULL;
5871#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872}
5873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5876 Py_ssize_t size,
5877 const char *errors,
5878 int byteorder)
5879{
5880 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005881 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 if (tmp == NULL)
5883 return NULL;
5884 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5885 Py_DECREF(tmp);
5886 return result;
5887}
5888
5889PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005890PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893}
5894
5895/* --- Unicode Escape Codec ----------------------------------------------- */
5896
Fredrik Lundh06d12682001-01-24 07:59:11 +00005897static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005900_PyUnicode_DecodeUnicodeEscape(const char *s,
5901 Py_ssize_t size,
5902 const char *errors,
5903 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005906 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 PyObject *errorHandler = NULL;
5909 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005910
Eric V. Smith42454af2016-10-31 09:22:08 -04005911 // so we can remember if we've seen an invalid escape char or not
5912 *first_invalid_escape = NULL;
5913
Victor Stinner62ec3312016-09-06 17:04:34 -07005914 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005915 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005916 }
5917 /* Escaped strings will always be longer than the resulting
5918 Unicode string, so we start with size here and then reduce the
5919 length after conversion to the true value.
5920 (but if the error callback returns a long replacement string
5921 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005922 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005923 writer.min_length = size;
5924 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5925 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005926 }
5927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 end = s + size;
5929 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005930 unsigned char c = (unsigned char) *s++;
5931 Py_UCS4 ch;
5932 int count;
5933 Py_ssize_t startinpos;
5934 Py_ssize_t endinpos;
5935 const char *message;
5936
5937#define WRITE_ASCII_CHAR(ch) \
5938 do { \
5939 assert(ch <= 127); \
5940 assert(writer.pos < writer.size); \
5941 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5942 } while(0)
5943
5944#define WRITE_CHAR(ch) \
5945 do { \
5946 if (ch <= writer.maxchar) { \
5947 assert(writer.pos < writer.size); \
5948 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5949 } \
5950 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5951 goto onError; \
5952 } \
5953 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
5955 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005956 if (c != '\\') {
5957 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 continue;
5959 }
5960
Victor Stinner62ec3312016-09-06 17:04:34 -07005961 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 if (s >= end) {
5964 message = "\\ at end of string";
5965 goto error;
5966 }
5967 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005970 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 case '\n': continue;
5974 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5975 case '\'': WRITE_ASCII_CHAR('\''); continue;
5976 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5977 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005978 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005979 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5980 case 't': WRITE_ASCII_CHAR('\t'); continue;
5981 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5982 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 case '0': case '1': case '2': case '3':
5990 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005992 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 ch = (ch<<3) + *s++ - '0';
5994 if (s < end && '0' <= *s && *s <= '7') {
5995 ch = (ch<<3) + *s++ - '0';
5996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 WRITE_CHAR(ch);
5999 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 /* hex escapes */
6002 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006004 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006005 message = "truncated \\xXX escape";
6006 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006010 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006011 message = "truncated \\uXXXX escape";
6012 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006015 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006017 message = "truncated \\UXXXXXXXX escape";
6018 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006019 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006020 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006021 ch <<= 4;
6022 if (c >= '0' && c <= '9') {
6023 ch += c - '0';
6024 }
6025 else if (c >= 'a' && c <= 'f') {
6026 ch += c - ('a' - 10);
6027 }
6028 else if (c >= 'A' && c <= 'F') {
6029 ch += c - ('A' - 10);
6030 }
6031 else {
6032 break;
6033 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006034 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006035 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006036 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006037 }
6038
6039 /* when we get here, ch is a 32-bit unicode character */
6040 if (ch > MAX_UNICODE) {
6041 message = "illegal Unicode character";
6042 goto error;
6043 }
6044
6045 WRITE_CHAR(ch);
6046 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006050 if (ucnhash_CAPI == NULL) {
6051 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6053 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 if (ucnhash_CAPI == NULL) {
6055 PyErr_SetString(
6056 PyExc_UnicodeError,
6057 "\\N escapes not supported (can't load unicodedata module)"
6058 );
6059 goto onError;
6060 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006061 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006062
6063 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006064 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 const char *start = ++s;
6066 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006067 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006070 namelen = s - start;
6071 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 ch = 0xffffffff; /* in case 'getcode' messes up */
6075 if (namelen <= INT_MAX &&
6076 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6077 &ch, 0)) {
6078 assert(ch <= MAX_UNICODE);
6079 WRITE_CHAR(ch);
6080 continue;
6081 }
6082 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006083 }
6084 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006085 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006086
6087 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006088 if (*first_invalid_escape == NULL) {
6089 *first_invalid_escape = s-1; /* Back up one char, since we've
6090 already incremented s. */
6091 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 WRITE_ASCII_CHAR('\\');
6093 WRITE_CHAR(c);
6094 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006096
6097 error:
6098 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006099 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006100 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006101 errors, &errorHandler,
6102 "unicodeescape", message,
6103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006104 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006105 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006106 }
6107 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6108 goto onError;
6109 }
6110
6111#undef WRITE_ASCII_CHAR
6112#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006114
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006117 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006118
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006120 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 return NULL;
6124}
6125
Eric V. Smith42454af2016-10-31 09:22:08 -04006126PyObject *
6127PyUnicode_DecodeUnicodeEscape(const char *s,
6128 Py_ssize_t size,
6129 const char *errors)
6130{
6131 const char *first_invalid_escape;
6132 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6133 &first_invalid_escape);
6134 if (result == NULL)
6135 return NULL;
6136 if (first_invalid_escape != NULL) {
6137 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6138 "invalid escape sequence '\\%c'",
6139 *first_invalid_escape) < 0) {
6140 Py_DECREF(result);
6141 return NULL;
6142 }
6143 }
6144 return result;
6145}
6146
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006147/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Alexander Belopolsky40018472011-02-26 01:02:56 +00006149PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
Ezio Melottie7f90372012-10-05 03:33:31 +03006159 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006160 escape.
6161
Ezio Melottie7f90372012-10-05 03:33:31 +03006162 For UCS1 strings it's '\xxx', 4 bytes per source character.
6163 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6164 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006165 */
6166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 if (!PyUnicode_Check(unicode)) {
6168 PyErr_BadArgument();
6169 return NULL;
6170 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 }
Victor Stinner358af132015-10-12 22:36:57 +02006174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 if (len == 0) {
6177 return PyBytes_FromStringAndSize(NULL, 0);
6178 }
6179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 kind = PyUnicode_KIND(unicode);
6181 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6183 bytes, and 1 byte characters 4. */
6184 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006185 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 return PyErr_NoMemory();
6187 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006188 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (repr == NULL) {
6190 return NULL;
6191 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006195 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 /* U+0000-U+00ff range */
6198 if (ch < 0x100) {
6199 if (ch >= ' ' && ch < 127) {
6200 if (ch != '\\') {
6201 /* Copy printable US ASCII as-is */
6202 *p++ = (char) ch;
6203 }
6204 /* Escape backslashes */
6205 else {
6206 *p++ = '\\';
6207 *p++ = '\\';
6208 }
6209 }
Victor Stinner358af132015-10-12 22:36:57 +02006210
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 /* Map special whitespace to '\t', '\n', '\r' */
6212 else if (ch == '\t') {
6213 *p++ = '\\';
6214 *p++ = 't';
6215 }
6216 else if (ch == '\n') {
6217 *p++ = '\\';
6218 *p++ = 'n';
6219 }
6220 else if (ch == '\r') {
6221 *p++ = '\\';
6222 *p++ = 'r';
6223 }
6224
6225 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6226 else {
6227 *p++ = '\\';
6228 *p++ = 'x';
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
6231 }
Tim Petersced69f82003-09-16 20:30:58 +00006232 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006233 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 *p++ = '\\';
6236 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006237 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6238 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6243 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006244
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 /* Make sure that the first two digits are zero */
6246 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006247 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 *p++ = 'U';
6249 *p++ = '0';
6250 *p++ = '0';
6251 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6256 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 assert(p - PyBytes_AS_STRING(repr) > 0);
6261 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6262 return NULL;
6263 }
6264 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006271 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006272 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006273 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 }
6276
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 result = PyUnicode_AsUnicodeEscapeString(tmp);
6278 Py_DECREF(tmp);
6279 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
6282/* --- Raw Unicode Escape Codec ------------------------------------------- */
6283
/* Decode `size` bytes at `s` with the raw-unicode-escape codec.

   - A byte that is not a backslash, or a backslash that is the very last
     input byte, is written as the code point with the same ordinal.
   - "\u" must be followed by 4 hex digits and "\U" by 8; the parsed value
     is written as one code point if it does not exceed MAX_UNICODE.
   - A backslash followed by any other byte is copied as two characters.

   A truncated escape or an out-of-range \U value is passed to
   unicode_decode_call_errorhandler_writer() together with `errors`.
   Returns the result of _PyUnicodeWriter_Finish(), or NULL after jumping
   to onError.  A `size` of 0 returns through _Py_RETURN_UNICODE_EMPTY(). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006284PyObject *
6285PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006286 Py_ssize_t size,
6287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006290 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 PyObject *errorHandler = NULL;
6293 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006294
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006296 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 /* Escaped strings will always be longer than the resulting
6300 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 length after conversion to the true value. (But decoding error
6302 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006303 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 writer.min_length = size;
6305 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6306 goto onError;
6307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 end = s + size;
6310 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 unsigned char c = (unsigned char) *s++;
6312 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006313 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 Py_ssize_t startinpos;
6315 Py_ssize_t endinpos;
6316 const char *message;
6317
        /* WRITE_CHAR stores ch directly when it fits the writer's current
           maxchar; anything larger goes through
           _PyUnicodeWriter_WriteCharInline(), and a failure there jumps to
           onError.  The macro is defined here and #undef'ed at the bottom
           of the loop body. */
6318#define WRITE_CHAR(ch) \
6319 do { \
6320 if (ch <= writer.maxchar) { \
6321 assert(writer.pos < writer.size); \
6322 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6323 } \
6324 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6325 goto onError; \
6326 } \
6327 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 /* Non-escape characters are interpreted as Unicode ordinals */
        /* The `s >= end` test makes a backslash that ends the input take
           this literal path as well. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (c != '\\' || s >= end) {
6331 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006334
        /* Look at the byte after the backslash: it selects the number of
           hex digits to read and the message used if the escape turns out
           to be truncated. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 c = (unsigned char) *s++;
6336 if (c == 'u') {
6337 count = 4;
6338 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 else if (c == 'U') {
6341 count = 8;
6342 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 }
6344 else {
            /* Not an escape: emit the backslash and the following byte
               unchanged. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 assert(writer.pos < writer.size);
6346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6347 WRITE_CHAR(c);
6348 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 }
        /* s has already moved past the backslash and the 'u'/'U', so step
           back two bytes to get the offset of the backslash. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 startinpos = s - starts - 2;
6351
6352 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6353 for (ch = 0; count && s < end; ++s, --count) {
6354 c = (unsigned char)*s;
6355 ch <<= 4;
6356 if (c >= '0' && c <= '9') {
6357 ch += c - '0';
6358 }
6359 else if (c >= 'a' && c <= 'f') {
6360 ch += c - ('a' - 10);
6361 }
6362 else if (c >= 'A' && c <= 'F') {
6363 ch += c - ('A' - 10);
6364 }
6365 else {
6366 break;
6367 }
6368 }
        /* count == 0 means every expected hex digit was consumed.  If the
           loop stopped early, `message` still holds the "truncated" text
           chosen above. */
6369 if (!count) {
6370 if (ch <= MAX_UNICODE) {
6371 WRITE_CHAR(ch);
6372 continue;
6373 }
6374 message = "\\Uxxxxxxxx out of range";
6375 }
6376
        /* Error path: report the span [startinpos, endinpos) to the error
           handler.  min_length is refreshed to the characters written so
           far plus the number of unread input bytes. */
6377 endinpos = s-starts;
6378 writer.min_length = end - s + writer.pos;
6379 if (unicode_decode_call_errorhandler_writer(
6380 errors, &errorHandler,
6381 "rawunicodeescape", message,
6382 &starts, &end, &startinpos, &endinpos, &exc, &s,
6383 &writer)) {
6384 goto onError;
6385 }
        /* NOTE(review): presumably this re-reserves space so that the
           direct PyUnicode_WRITE fast path stays within the buffer after
           the handler has touched the writer -- confirm against
           _PyUnicodeWriter_Prepare(). */
6386 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6387 goto onError;
6388 }
6389
6390#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 Py_XDECREF(errorHandler);
6393 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006394 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006395
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006397 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 Py_XDECREF(errorHandler);
6399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402}
6403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404
Alexander Belopolsky40018472011-02-26 01:02:56 +00006405PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 int kind;
6412 void *data;
6413 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 if (!PyUnicode_Check(unicode)) {
6416 PyErr_BadArgument();
6417 return NULL;
6418 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006422 kind = PyUnicode_KIND(unicode);
6423 data = PyUnicode_DATA(unicode);
6424 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 if (kind == PyUnicode_1BYTE_KIND) {
6426 return PyBytes_FromStringAndSize(data, len);
6427 }
Victor Stinner0e368262011-11-10 20:12:49 +01006428
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6430 bytes, and 1 byte characters 4. */
6431 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006432
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 if (len > PY_SSIZE_T_MAX / expandsize) {
6434 return PyErr_NoMemory();
6435 }
6436 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6437 if (repr == NULL) {
6438 return NULL;
6439 }
6440 if (len == 0) {
6441 return repr;
6442 }
6443
6444 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445 for (pos = 0; pos < len; pos++) {
6446 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006447
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6449 if (ch < 0x100) {
6450 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006451 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6453 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 *p++ = '\\';
6455 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006456 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6459 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6462 else {
6463 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6464 *p++ = '\\';
6465 *p++ = 'U';
6466 *p++ = '0';
6467 *p++ = '0';
6468 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6473 *p++ = Py_hexdigits[ch & 15];
6474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476
Victor Stinner62ec3312016-09-06 17:04:34 -07006477 assert(p > PyBytes_AS_STRING(repr));
6478 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6479 return NULL;
6480 }
6481 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
Alexander Belopolsky40018472011-02-26 01:02:56 +00006484PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006485PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6486 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006488 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006489 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006491 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6493 Py_DECREF(tmp);
6494 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006497/* --- Unicode Internal Codec ------------------------------------------- */
6498
/* Decode `size` bytes at `s` that hold raw Py_UNICODE code units
   (Py_UNICODE_SIZE bytes each) into a str object.

   A DeprecationWarning is issued first; if PyErr_WarnEx() reports failure
   the function returns NULL.  A negative `size` is rejected with
   PyErr_BadInternalCall(); a `size` of 0 returns through
   _Py_RETURN_UNICODE_EMPTY().

   Fewer than Py_UNICODE_SIZE remaining bytes ("truncated input") and,
   when Py_UNICODE_WIDE is defined, values above 0x10ffff are routed to
   unicode_decode_call_errorhandler_writer() together with `errors`.  When
   Py_UNICODE_WIDE is not defined, a high surrogate directly followed by a
   low surrogate is joined into one code point.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL after jumping
   to onError. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006499PyObject *
6500_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006501 Py_ssize_t size,
6502 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006503{
6504 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006505 Py_ssize_t startinpos;
6506 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006507 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508 const char *end;
6509 const char *reason;
6510 PyObject *errorHandler = NULL;
6511 PyObject *exc = NULL;
6512
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006513 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006514 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006515 1))
6516 return NULL;
6517
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006518 if (size < 0) {
6519 PyErr_BadInternalCall();
6520 return NULL;
6521 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006522 if (size == 0)
6523 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006524
Victor Stinner8f674cc2013-04-17 23:02:17 +02006525 _PyUnicodeWriter_Init(&writer);
6526 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6527 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 }
    /* Expected output length: the number of code units in the input,
       rounded up so that a trailing partial unit still counts as one. */
6530 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006531
Victor Stinner8f674cc2013-04-17 23:02:17 +02006532 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006533 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006534 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006535 Py_UCS4 ch;
        /* Not enough bytes left for a whole code unit: the error span
           runs to the end of the input. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006536 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006537 endinpos = end-starts;
6538 reason = "truncated input";
6539 goto error;
6540 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006541 /* We copy the raw representation one byte at a time because the
6542 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006543 ((char *) &uch)[0] = s[0];
6544 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006546 ((char *) &uch)[2] = s[2];
6547 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006548#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006550#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006551 /* We have to sanity check the raw data, otherwise doom looms for
6552 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006553 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006554 endinpos = s - starts + Py_UNICODE_SIZE;
6555 reason = "illegal code point (> 0x10FFFF)";
6556 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006557 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559 s += Py_UNICODE_SIZE;
6560#ifndef Py_UNICODE_WIDE
        /* Narrow build: if the unit just read is a high surrogate and
           another full unit follows, peek at it (byte-wise, as above) and
           join the two when it is a low surrogate. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006561 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006563 Py_UNICODE uch2;
6564 ((char *) &uch2)[0] = s[0];
6565 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006566 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 {
Victor Stinner551ac952011-11-29 22:58:13 +01006568 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006569 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570 }
6571 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572#endif
6573
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006574 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006576 continue;
6577
        /* Reached only via goto from the checks above, which have already
           set endinpos and reason; s still points at the offending unit,
           so its offset becomes startinpos. */
6578 error:
6579 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006580 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006581 errors, &errorHandler,
6582 "unicode_internal", reason,
6583 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006584 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006585 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006586 }
6587
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006588 Py_XDECREF(errorHandler);
6589 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006590 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006591
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006593 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
6596 return NULL;
6597}
6598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599/* --- Latin-1 Codec ------------------------------------------------------ */
6600
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601PyObject *
6602PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006603 Py_ssize_t size,
6604 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006607 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611static void
6612make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006614 PyObject *unicode,
6615 Py_ssize_t startpos, Py_ssize_t endpos,
6616 const char *reason)
6617{
6618 if (*exceptionObject == NULL) {
6619 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006621 encoding, unicode, startpos, endpos, reason);
6622 }
6623 else {
6624 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6625 goto onError;
6626 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6627 goto onError;
6628 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6629 goto onError;
6630 return;
6631 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006632 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006633 }
6634}
6635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637static void
6638raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006639 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006640 PyObject *unicode,
6641 Py_ssize_t startpos, Py_ssize_t endpos,
6642 const char *reason)
6643{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006644 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006645 encoding, unicode, startpos, endpos, reason);
6646 if (*exceptionObject != NULL)
6647 PyCodec_StrictErrors(*exceptionObject);
6648}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649
6650/* error handling callback helper:
6651 build arguments, call the callback and check the arguments,
6652 put the result into newpos and return the replacement string, which
6653 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654static PyObject *
6655unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 PyObject **errorHandler,
6657 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006659 Py_ssize_t startpos, Py_ssize_t endpos,
6660 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006662 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 PyObject *restuple;
6665 PyObject *resunicode;
6666
6667 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 }
6672
Benjamin Petersonbac79492012-01-14 13:34:47 -05006673 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return NULL;
6675 len = PyUnicode_GET_LENGTH(unicode);
6676
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006677 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006682 restuple = PyObject_CallFunctionObjArgs(
6683 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006687 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 Py_DECREF(restuple);
6689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006691 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 &resunicode, newpos)) {
6693 Py_DECREF(restuple);
6694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006696 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6697 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6698 Py_DECREF(restuple);
6699 return NULL;
6700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 *newpos = len + *newpos;
6703 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006704 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 Py_DECREF(restuple);
6706 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 Py_INCREF(resunicode);
6709 Py_DECREF(restuple);
6710 return resunicode;
6711}
6712
Alexander Belopolsky40018472011-02-26 01:02:56 +00006713static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006715 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006716 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 /* input state */
6719 Py_ssize_t pos=0, size;
6720 int kind;
6721 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 /* pointer into the output */
6723 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006724 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6725 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006726 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006728 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006729 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006730 /* output object */
6731 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732
Benjamin Petersonbac79492012-01-14 13:34:47 -05006733 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 return NULL;
6735 size = PyUnicode_GET_LENGTH(unicode);
6736 kind = PyUnicode_KIND(unicode);
6737 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 /* allocate enough for a simple encoding without
6739 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006740 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006741 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006742
6743 _PyBytesWriter_Init(&writer);
6744 str = _PyBytesWriter_Alloc(&writer, size);
6745 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006746 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006748 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006749 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006752 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006754 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006756 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006758 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006761 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* find all unencodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006763
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006764 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006766
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006767 /* Only overallocate the buffer if it's not the last write */
6768 writer.overallocate = (collend < size);
6769
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006771 if (error_handler == _Py_ERROR_UNKNOWN)
6772 error_handler = get_error_handler(errors);
6773
6774 switch (error_handler) {
6775 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006776 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006778
6779 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006780 memset(str, '?', collend - collstart);
6781 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006782 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006783 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 break;
Victor Stinner50149202015-09-22 00:26:54 +02006786
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006787 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006788 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006789 writer.min_size -= (collend - collstart);
6790 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006792 if (str == NULL)
6793 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006794 pos = collend;
6795 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006797 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006798 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006799 writer.min_size -= (collend - collstart);
6800 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006801 unicode, collstart, collend);
6802 if (str == NULL)
6803 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 break;
Victor Stinner50149202015-09-22 00:26:54 +02006806
Victor Stinnerc3713e92015-09-29 12:32:13 +02006807 case _Py_ERROR_SURROGATEESCAPE:
6808 for (i = collstart; i < collend; ++i) {
6809 ch = PyUnicode_READ(kind, data, i);
6810 if (ch < 0xdc80 || 0xdcff < ch) {
6811 /* Not a UTF-8b surrogate */
6812 break;
6813 }
6814 *str++ = (char)(ch - 0xdc00);
6815 ++pos;
6816 }
6817 if (i >= collend)
6818 break;
6819 collstart = pos;
6820 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006821 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006822
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006824 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6825 encoding, reason, unicode, &exc,
6826 collstart, collend, &newpos);
6827 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006829
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006830 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006831 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006832
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006834 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006835 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006836 PyBytes_AS_STRING(rep),
6837 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006838 if (str == NULL)
6839 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006840 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 else {
6842 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006843
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006846
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006847 if (limit == 256 ?
6848 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6849 !PyUnicode_IS_ASCII(rep))
6850 {
6851 /* Not all characters are smaller than limit */
6852 raise_encode_exception(&exc, encoding, unicode,
6853 collstart, collend, reason);
6854 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006856 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6857 str = _PyBytesWriter_WriteBytes(&writer, str,
6858 PyUnicode_DATA(rep),
6859 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006861 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006863 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006864
6865 /* If overallocation was disabled, ensure that it was the last
6866 write. Otherwise, we missed an optimization */
6867 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868 }
6869 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006870
Victor Stinner50149202015-09-22 00:26:54 +02006871 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006873 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874
6875 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006876 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006877 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006878 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879 Py_XDECREF(exc);
6880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881}
6882
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
6885PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006886 Py_ssize_t size,
6887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006890 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 if (unicode == NULL)
6892 return NULL;
6893 result = unicode_encode_ucs1(unicode, errors, 256);
6894 Py_DECREF(unicode);
6895 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Alexander Belopolsky40018472011-02-26 01:02:56 +00006898PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006899_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
6901 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 PyErr_BadArgument();
6903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905 if (PyUnicode_READY(unicode) == -1)
6906 return NULL;
6907 /* Fast path: if it is a one-byte string, construct
6908 bytes object directly. */
6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911 PyUnicode_GET_LENGTH(unicode));
6912 /* Non-Latin-1 characters present. Defer to above function to
6913 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006915}
6916
6917PyObject*
6918PyUnicode_AsLatin1String(PyObject *unicode)
6919{
6920 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
6923/* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
6926PyUnicode_DecodeASCII(const char *s,
6927 Py_ssize_t size,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006931 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006932 int kind;
6933 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006934 Py_ssize_t startinpos;
6935 Py_ssize_t endinpos;
6936 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006938 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006940 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006943 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006946 if (size == 1 && (unsigned char)s[0] < 128)
6947 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006948
Victor Stinner8f674cc2013-04-17 23:02:17 +02006949 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006950 writer.min_length = size;
6951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006952 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006955 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006956 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006957 writer.pos = outpos;
6958 if (writer.pos == size)
6959 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006960
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006961 s += writer.pos;
6962 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006964 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 PyUnicode_WRITE(kind, data, writer.pos, c);
6967 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006971
6972 /* byte outsize range 0x00..0x7f: call the error handler */
6973
6974 if (error_handler == _Py_ERROR_UNKNOWN)
6975 error_handler = get_error_handler(errors);
6976
6977 switch (error_handler)
6978 {
6979 case _Py_ERROR_REPLACE:
6980 case _Py_ERROR_SURROGATEESCAPE:
6981 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006982 but we may switch to UCS2 at the first write */
6983 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6984 goto onError;
6985 kind = writer.kind;
6986 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987
6988 if (error_handler == _Py_ERROR_REPLACE)
6989 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6990 else
6991 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6992 writer.pos++;
6993 ++s;
6994 break;
6995
6996 case _Py_ERROR_IGNORE:
6997 ++s;
6998 break;
6999
7000 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 startinpos = s-starts;
7002 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007003 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007004 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 "ascii", "ordinal not in range(128)",
7006 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 kind = writer.kind;
7010 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007013 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007015 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007016
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007019 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007020 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 return NULL;
7022}
7023
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007024/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
7026PyUnicode_EncodeASCII(const Py_UNICODE *p,
7027 Py_ssize_t size,
7028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007030 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007031 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 if (unicode == NULL)
7033 return NULL;
7034 result = unicode_encode_ucs1(unicode, errors, 128);
7035 Py_DECREF(unicode);
7036 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037}
7038
Alexander Belopolsky40018472011-02-26 01:02:56 +00007039PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041{
7042 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 PyErr_BadArgument();
7044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007046 if (PyUnicode_READY(unicode) == -1)
7047 return NULL;
7048 /* Fast path: if it is an ASCII-only string, construct bytes object
7049 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007050 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007051 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7052 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007053 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007054}
7055
7056PyObject *
7057PyUnicode_AsASCIIString(PyObject *unicode)
7058{
7059 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060}
7061
Steve Dowercc16be82016-09-08 10:35:16 -07007062#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007063
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007064/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007065
/* NEED_RETRY is defined when an int is narrower than a size_t; the code
   page decoder tests it to decide whether inputs larger than INT_MAX must
   be processed in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Fallback value for SDK headers that do not define WC_ERR_INVALID_CHARS. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif

Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007074static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007075code_page_name(UINT code_page, PyObject **obj)
7076{
7077 *obj = NULL;
7078 if (code_page == CP_ACP)
7079 return "mbcs";
7080 if (code_page == CP_UTF7)
7081 return "CP_UTF7";
7082 if (code_page == CP_UTF8)
7083 return "CP_UTF8";
7084
7085 *obj = PyBytes_FromFormat("cp%u", code_page);
7086 if (*obj == NULL)
7087 return NULL;
7088 return PyBytes_AS_STRING(*obj);
7089}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090
Victor Stinner3a50e702011-10-18 21:21:00 +02007091static DWORD
7092decode_code_page_flags(UINT code_page)
7093{
7094 if (code_page == CP_UTF7) {
7095 /* The CP_UTF7 decoder only supports flags=0 */
7096 return 0;
7097 }
7098 else
7099 return MB_ERR_INVALID_CHARS;
7100}
7101
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 * Decode a byte string from a Windows code page into unicode object in strict
7104 * mode.
7105 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007106 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7107 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007109static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007110decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007111 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 const char *in,
7113 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114{
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007116 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118
7119 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 assert(insize > 0);
7121 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7122 if (outsize <= 0)
7123 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124
7125 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007127 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007128 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 if (*v == NULL)
7130 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007132 }
7133 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007136 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139 }
7140
7141 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7143 if (outsize <= 0)
7144 goto error;
7145 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007146
Victor Stinner3a50e702011-10-18 21:21:00 +02007147error:
7148 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149 return -2;
7150 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007151 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007152}
7153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154/*
7155 * Decode a byte string from a code page into unicode object with an error
7156 * handler.
7157 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007158 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 * UnicodeDecodeError exception and returns -1 on error.
7160 */
7161static int
7162decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007163 PyObject **v,
7164 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007165 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007166{
7167 const char *startin = in;
7168 const char *endin = in + size;
7169 const DWORD flags = decode_code_page_flags(code_page);
7170 /* Ideally, we should get reason from FormatMessage. This is the Windows
7171 2000 English version of the message. */
7172 const char *reason = "No mapping for the Unicode character exists "
7173 "in the target code page.";
7174 /* each step cannot decode more than 1 character, but a character can be
7175 represented as a surrogate pair */
7176 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007177 int insize;
7178 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 PyObject *errorHandler = NULL;
7180 PyObject *exc = NULL;
7181 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007182 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 DWORD err;
7184 int ret = -1;
7185
7186 assert(size > 0);
7187
7188 encoding = code_page_name(code_page, &encoding_obj);
7189 if (encoding == NULL)
7190 return -1;
7191
Victor Stinner7d00cc12014-03-17 23:08:06 +01007192 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7194 UnicodeDecodeError. */
7195 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7196 if (exc != NULL) {
7197 PyCodec_StrictErrors(exc);
7198 Py_CLEAR(exc);
7199 }
7200 goto error;
7201 }
7202
7203 if (*v == NULL) {
7204 /* Create unicode object */
7205 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7206 PyErr_NoMemory();
7207 goto error;
7208 }
Victor Stinnerab595942011-12-17 04:59:06 +01007209 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 if (*v == NULL)
7212 goto error;
7213 startout = PyUnicode_AS_UNICODE(*v);
7214 }
7215 else {
7216 /* Extend unicode object */
7217 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7218 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7219 PyErr_NoMemory();
7220 goto error;
7221 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007222 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 goto error;
7224 startout = PyUnicode_AS_UNICODE(*v) + n;
7225 }
7226
7227 /* Decode the byte string character per character */
7228 out = startout;
7229 while (in < endin)
7230 {
7231 /* Decode a character */
7232 insize = 1;
7233 do
7234 {
7235 outsize = MultiByteToWideChar(code_page, flags,
7236 in, insize,
7237 buffer, Py_ARRAY_LENGTH(buffer));
7238 if (outsize > 0)
7239 break;
7240 err = GetLastError();
7241 if (err != ERROR_NO_UNICODE_TRANSLATION
7242 && err != ERROR_INSUFFICIENT_BUFFER)
7243 {
7244 PyErr_SetFromWindowsErr(0);
7245 goto error;
7246 }
7247 insize++;
7248 }
7249 /* 4=maximum length of a UTF-8 sequence */
7250 while (insize <= 4 && (in + insize) <= endin);
7251
7252 if (outsize <= 0) {
7253 Py_ssize_t startinpos, endinpos, outpos;
7254
Victor Stinner7d00cc12014-03-17 23:08:06 +01007255 /* last character in partial decode? */
7256 if (in + insize >= endin && !final)
7257 break;
7258
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 startinpos = in - startin;
7260 endinpos = startinpos + 1;
7261 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007262 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 errors, &errorHandler,
7264 encoding, reason,
7265 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007266 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 {
7268 goto error;
7269 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007270 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 }
7272 else {
7273 in += insize;
7274 memcpy(out, buffer, outsize * sizeof(wchar_t));
7275 out += outsize;
7276 }
7277 }
7278
7279 /* write a NUL character at the end */
7280 *out = 0;
7281
7282 /* Extend unicode object */
7283 outsize = out - startout;
7284 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007285 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007287 /* (in - startin) <= size and size is an int */
7288 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007289
7290error:
7291 Py_XDECREF(encoding_obj);
7292 Py_XDECREF(errorHandler);
7293 Py_XDECREF(exc);
7294 return ret;
7295}
7296
Victor Stinner3a50e702011-10-18 21:21:00 +02007297static PyObject *
7298decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 const char *s, Py_ssize_t size,
7300 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301{
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 PyObject *v = NULL;
7303 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 if (code_page < 0) {
7306 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7307 return NULL;
7308 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007309 if (size < 0) {
7310 PyErr_BadInternalCall();
7311 return NULL;
7312 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007313
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 do
7318 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007320 if (size > INT_MAX) {
7321 chunk_size = INT_MAX;
7322 final = 0;
7323 done = 0;
7324 }
7325 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007327 {
7328 chunk_size = (int)size;
7329 final = (consumed == NULL);
7330 done = 1;
7331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 if (chunk_size == 0 && done) {
7334 if (v != NULL)
7335 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007336 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 converted = decode_code_page_strict(code_page, &v,
7340 s, chunk_size);
7341 if (converted == -2)
7342 converted = decode_code_page_errors(code_page, &v,
7343 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007344 errors, final);
7345 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007346
7347 if (converted < 0) {
7348 Py_XDECREF(v);
7349 return NULL;
7350 }
7351
7352 if (consumed)
7353 *consumed += converted;
7354
7355 s += converted;
7356 size -= converted;
7357 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007358
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007359 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360}
7361
Alexander Belopolsky40018472011-02-26 01:02:56 +00007362PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007363PyUnicode_DecodeCodePageStateful(int code_page,
7364 const char *s,
7365 Py_ssize_t size,
7366 const char *errors,
7367 Py_ssize_t *consumed)
7368{
7369 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7370}
7371
7372PyObject *
7373PyUnicode_DecodeMBCSStateful(const char *s,
7374 Py_ssize_t size,
7375 const char *errors,
7376 Py_ssize_t *consumed)
7377{
7378 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7379}
7380
7381PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007382PyUnicode_DecodeMBCS(const char *s,
7383 Py_ssize_t size,
7384 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007385{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7387}
7388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389static DWORD
7390encode_code_page_flags(UINT code_page, const char *errors)
7391{
7392 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007393 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 }
7395 else if (code_page == CP_UTF7) {
7396 /* CP_UTF7 only supports flags=0 */
7397 return 0;
7398 }
7399 else {
7400 if (errors != NULL && strcmp(errors, "replace") == 0)
7401 return 0;
7402 else
7403 return WC_NO_BEST_FIT_CHARS;
7404 }
7405}
7406
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007407/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 * Encode a Unicode string to a Windows code page into a byte string in strict
7409 * mode.
7410 *
7411 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007412 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007414static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007415encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007416 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418{
Victor Stinner554f3f02010-06-16 23:33:54 +00007419 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 BOOL *pusedDefaultChar = &usedDefaultChar;
7421 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007422 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const DWORD flags = encode_code_page_flags(code_page, NULL);
7425 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007426 /* Create a substring so that we can get the UTF-16 representation
7427 of just the slice under consideration. */
7428 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429
Martin v. Löwis3d325192011-11-04 18:23:06 +01007430 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007431
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007433 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007435 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007436
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 substring = PyUnicode_Substring(unicode, offset, offset+len);
7438 if (substring == NULL)
7439 return -1;
7440 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7441 if (p == NULL) {
7442 Py_DECREF(substring);
7443 return -1;
7444 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007445 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007447 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007449 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 NULL, 0,
7451 NULL, pusedDefaultChar);
7452 if (outsize <= 0)
7453 goto error;
7454 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 if (pusedDefaultChar && *pusedDefaultChar) {
7456 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 if (*outbytes == NULL) {
7464 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468 }
7469 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 const Py_ssize_t n = PyBytes_Size(*outbytes);
7472 if (outsize > PY_SSIZE_T_MAX - n) {
7473 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7478 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482 }
7483
7484 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007486 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 out, outsize,
7488 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 if (outsize <= 0)
7491 goto error;
7492 if (pusedDefaultChar && *pusedDefaultChar)
7493 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007497 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7499 return -2;
7500 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007501 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007502}
7503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007505 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 * error handler.
7507 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007508 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 * -1 on other error.
7510 */
7511static int
7512encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007513 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007514 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007515{
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 Py_ssize_t pos = unicode_offset;
7518 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 /* Ideally, we should get reason from FormatMessage. This is the Windows
7520 2000 English version of the message. */
7521 const char *reason = "invalid character";
7522 /* 4=maximum length of a UTF-8 sequence */
7523 char buffer[4];
7524 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7525 Py_ssize_t outsize;
7526 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 PyObject *errorHandler = NULL;
7528 PyObject *exc = NULL;
7529 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007530 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 PyObject *rep;
7533 int ret = -1;
7534
7535 assert(insize > 0);
7536
7537 encoding = code_page_name(code_page, &encoding_obj);
7538 if (encoding == NULL)
7539 return -1;
7540
7541 if (errors == NULL || strcmp(errors, "strict") == 0) {
7542 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7543 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007544 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 if (exc != NULL) {
7546 PyCodec_StrictErrors(exc);
7547 Py_DECREF(exc);
7548 }
7549 Py_XDECREF(encoding_obj);
7550 return -1;
7551 }
7552
7553 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7554 pusedDefaultChar = &usedDefaultChar;
7555 else
7556 pusedDefaultChar = NULL;
7557
7558 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7559 PyErr_NoMemory();
7560 goto error;
7561 }
7562 outsize = insize * Py_ARRAY_LENGTH(buffer);
7563
7564 if (*outbytes == NULL) {
7565 /* Create string object */
7566 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7567 if (*outbytes == NULL)
7568 goto error;
7569 out = PyBytes_AS_STRING(*outbytes);
7570 }
7571 else {
7572 /* Extend string object */
7573 Py_ssize_t n = PyBytes_Size(*outbytes);
7574 if (n > PY_SSIZE_T_MAX - outsize) {
7575 PyErr_NoMemory();
7576 goto error;
7577 }
7578 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7579 goto error;
7580 out = PyBytes_AS_STRING(*outbytes) + n;
7581 }
7582
7583 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007584 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7587 wchar_t chars[2];
7588 int charsize;
7589 if (ch < 0x10000) {
7590 chars[0] = (wchar_t)ch;
7591 charsize = 1;
7592 }
7593 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007594 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7595 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007596 charsize = 2;
7597 }
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 buffer, Py_ARRAY_LENGTH(buffer),
7602 NULL, pusedDefaultChar);
7603 if (outsize > 0) {
7604 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7605 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 memcpy(out, buffer, outsize);
7608 out += outsize;
7609 continue;
7610 }
7611 }
7612 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7613 PyErr_SetFromWindowsErr(0);
7614 goto error;
7615 }
7616
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 rep = unicode_encode_call_errorhandler(
7618 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007619 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 if (rep == NULL)
7622 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007624
7625 if (PyBytes_Check(rep)) {
7626 outsize = PyBytes_GET_SIZE(rep);
7627 if (outsize != 1) {
7628 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7629 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7630 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7631 Py_DECREF(rep);
7632 goto error;
7633 }
7634 out = PyBytes_AS_STRING(*outbytes) + offset;
7635 }
7636 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7637 out += outsize;
7638 }
7639 else {
7640 Py_ssize_t i;
7641 enum PyUnicode_Kind kind;
7642 void *data;
7643
Benjamin Petersonbac79492012-01-14 13:34:47 -05007644 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 Py_DECREF(rep);
7646 goto error;
7647 }
7648
7649 outsize = PyUnicode_GET_LENGTH(rep);
7650 if (outsize != 1) {
7651 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7652 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7653 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7654 Py_DECREF(rep);
7655 goto error;
7656 }
7657 out = PyBytes_AS_STRING(*outbytes) + offset;
7658 }
7659 kind = PyUnicode_KIND(rep);
7660 data = PyUnicode_DATA(rep);
7661 for (i=0; i < outsize; i++) {
7662 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7663 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007664 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 encoding, unicode,
7666 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 "unable to encode error handler result to ASCII");
7668 Py_DECREF(rep);
7669 goto error;
7670 }
7671 *out = (unsigned char)ch;
7672 out++;
7673 }
7674 }
7675 Py_DECREF(rep);
7676 }
7677 /* write a NUL byte */
7678 *out = 0;
7679 outsize = out - PyBytes_AS_STRING(*outbytes);
7680 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7681 if (_PyBytes_Resize(outbytes, outsize) < 0)
7682 goto error;
7683 ret = 0;
7684
7685error:
7686 Py_XDECREF(encoding_obj);
7687 Py_XDECREF(errorHandler);
7688 Py_XDECREF(exc);
7689 return ret;
7690}
7691
Victor Stinner3a50e702011-10-18 21:21:00 +02007692static PyObject *
7693encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007694 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 const char *errors)
7696{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007697 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007699 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007701
Victor Stinner29dacf22015-01-26 16:41:32 +01007702 if (!PyUnicode_Check(unicode)) {
7703 PyErr_BadArgument();
7704 return NULL;
7705 }
7706
Benjamin Petersonbac79492012-01-14 13:34:47 -05007707 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007708 return NULL;
7709 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 if (code_page < 0) {
7712 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7713 return NULL;
7714 }
7715
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007717 return PyBytes_FromStringAndSize(NULL, 0);
7718
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 offset = 0;
7720 do
7721 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007722#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007723 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chunks. */
7725 if (len > INT_MAX/2) {
7726 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 done = 0;
7728 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007730#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 done = 1;
7734 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007735
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 errors);
7739 if (ret == -2)
7740 ret = encode_code_page_errors(code_page, &outbytes,
7741 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 if (ret < 0) {
7744 Py_XDECREF(outbytes);
7745 return NULL;
7746 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007747
Victor Stinner7581cef2011-11-03 22:32:33 +01007748 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007749 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007750 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 return outbytes;
7753}
7754
7755PyObject *
7756PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7757 Py_ssize_t size,
7758 const char *errors)
7759{
Victor Stinner7581cef2011-11-03 22:32:33 +01007760 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007761 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 if (unicode == NULL)
7763 return NULL;
7764 res = encode_code_page(CP_ACP, unicode, errors);
7765 Py_DECREF(unicode);
7766 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007767}
7768
7769PyObject *
7770PyUnicode_EncodeCodePage(int code_page,
7771 PyObject *unicode,
7772 const char *errors)
7773{
Victor Stinner7581cef2011-11-03 22:32:33 +01007774 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007775}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007776
Alexander Belopolsky40018472011-02-26 01:02:56 +00007777PyObject *
7778PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007779{
Victor Stinner7581cef2011-11-03 22:32:33 +01007780 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007781}
7782
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007783#undef NEED_RETRY
7784
Steve Dowercc16be82016-09-08 10:35:16 -07007785#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787/* --- Character Mapping Codec -------------------------------------------- */
7788
Victor Stinnerfb161b12013-04-18 01:44:27 +02007789static int
7790charmap_decode_string(const char *s,
7791 Py_ssize_t size,
7792 PyObject *mapping,
7793 const char *errors,
7794 _PyUnicodeWriter *writer)
7795{
7796 const char *starts = s;
7797 const char *e;
7798 Py_ssize_t startinpos, endinpos;
7799 PyObject *errorHandler = NULL, *exc = NULL;
7800 Py_ssize_t maplen;
7801 enum PyUnicode_Kind mapkind;
7802 void *mapdata;
7803 Py_UCS4 x;
7804 unsigned char ch;
7805
7806 if (PyUnicode_READY(mapping) == -1)
7807 return -1;
7808
7809 maplen = PyUnicode_GET_LENGTH(mapping);
7810 mapdata = PyUnicode_DATA(mapping);
7811 mapkind = PyUnicode_KIND(mapping);
7812
7813 e = s + size;
7814
7815 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7816 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7817 * is disabled in encoding aliases, latin1 is preferred because
7818 * its implementation is faster. */
7819 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7820 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7821 Py_UCS4 maxchar = writer->maxchar;
7822
7823 assert (writer->kind == PyUnicode_1BYTE_KIND);
7824 while (s < e) {
7825 ch = *s;
7826 x = mapdata_ucs1[ch];
7827 if (x > maxchar) {
7828 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7829 goto onError;
7830 maxchar = writer->maxchar;
7831 outdata = (Py_UCS1 *)writer->data;
7832 }
7833 outdata[writer->pos] = x;
7834 writer->pos++;
7835 ++s;
7836 }
7837 return 0;
7838 }
7839
7840 while (s < e) {
7841 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7842 enum PyUnicode_Kind outkind = writer->kind;
7843 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7844 if (outkind == PyUnicode_1BYTE_KIND) {
7845 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7846 Py_UCS4 maxchar = writer->maxchar;
7847 while (s < e) {
7848 ch = *s;
7849 x = mapdata_ucs2[ch];
7850 if (x > maxchar)
7851 goto Error;
7852 outdata[writer->pos] = x;
7853 writer->pos++;
7854 ++s;
7855 }
7856 break;
7857 }
7858 else if (outkind == PyUnicode_2BYTE_KIND) {
7859 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7860 while (s < e) {
7861 ch = *s;
7862 x = mapdata_ucs2[ch];
7863 if (x == 0xFFFE)
7864 goto Error;
7865 outdata[writer->pos] = x;
7866 writer->pos++;
7867 ++s;
7868 }
7869 break;
7870 }
7871 }
7872 ch = *s;
7873
7874 if (ch < maplen)
7875 x = PyUnicode_READ(mapkind, mapdata, ch);
7876 else
7877 x = 0xfffe; /* invalid value */
7878Error:
7879 if (x == 0xfffe)
7880 {
7881 /* undefined mapping */
7882 startinpos = s-starts;
7883 endinpos = startinpos+1;
7884 if (unicode_decode_call_errorhandler_writer(
7885 errors, &errorHandler,
7886 "charmap", "character maps to <undefined>",
7887 &starts, &e, &startinpos, &endinpos, &exc, &s,
7888 writer)) {
7889 goto onError;
7890 }
7891 continue;
7892 }
7893
7894 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7895 goto onError;
7896 ++s;
7897 }
7898 Py_XDECREF(errorHandler);
7899 Py_XDECREF(exc);
7900 return 0;
7901
7902onError:
7903 Py_XDECREF(errorHandler);
7904 Py_XDECREF(exc);
7905 return -1;
7906}
7907
7908static int
7909charmap_decode_mapping(const char *s,
7910 Py_ssize_t size,
7911 PyObject *mapping,
7912 const char *errors,
7913 _PyUnicodeWriter *writer)
7914{
7915 const char *starts = s;
7916 const char *e;
7917 Py_ssize_t startinpos, endinpos;
7918 PyObject *errorHandler = NULL, *exc = NULL;
7919 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007920 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007921
7922 e = s + size;
7923
7924 while (s < e) {
7925 ch = *s;
7926
7927 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7928 key = PyLong_FromLong((long)ch);
7929 if (key == NULL)
7930 goto onError;
7931
7932 item = PyObject_GetItem(mapping, key);
7933 Py_DECREF(key);
7934 if (item == NULL) {
7935 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7936 /* No mapping found means: mapping is undefined. */
7937 PyErr_Clear();
7938 goto Undefined;
7939 } else
7940 goto onError;
7941 }
7942
7943 /* Apply mapping */
7944 if (item == Py_None)
7945 goto Undefined;
7946 if (PyLong_Check(item)) {
7947 long value = PyLong_AS_LONG(item);
7948 if (value == 0xFFFE)
7949 goto Undefined;
7950 if (value < 0 || value > MAX_UNICODE) {
7951 PyErr_Format(PyExc_TypeError,
7952 "character mapping must be in range(0x%lx)",
7953 (unsigned long)MAX_UNICODE + 1);
7954 goto onError;
7955 }
7956
7957 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7958 goto onError;
7959 }
7960 else if (PyUnicode_Check(item)) {
7961 if (PyUnicode_READY(item) == -1)
7962 goto onError;
7963 if (PyUnicode_GET_LENGTH(item) == 1) {
7964 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7965 if (value == 0xFFFE)
7966 goto Undefined;
7967 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7968 goto onError;
7969 }
7970 else {
7971 writer->overallocate = 1;
7972 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7973 goto onError;
7974 }
7975 }
7976 else {
7977 /* wrong return value */
7978 PyErr_SetString(PyExc_TypeError,
7979 "character mapping must return integer, None or str");
7980 goto onError;
7981 }
7982 Py_CLEAR(item);
7983 ++s;
7984 continue;
7985
7986Undefined:
7987 /* undefined mapping */
7988 Py_CLEAR(item);
7989 startinpos = s-starts;
7990 endinpos = startinpos+1;
7991 if (unicode_decode_call_errorhandler_writer(
7992 errors, &errorHandler,
7993 "charmap", "character maps to <undefined>",
7994 &starts, &e, &startinpos, &endinpos, &exc, &s,
7995 writer)) {
7996 goto onError;
7997 }
7998 }
7999 Py_XDECREF(errorHandler);
8000 Py_XDECREF(exc);
8001 return 0;
8002
8003onError:
8004 Py_XDECREF(item);
8005 Py_XDECREF(errorHandler);
8006 Py_XDECREF(exc);
8007 return -1;
8008}
8009
Alexander Belopolsky40018472011-02-26 01:02:56 +00008010PyObject *
8011PyUnicode_DecodeCharmap(const char *s,
8012 Py_ssize_t size,
8013 PyObject *mapping,
8014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008016 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008017
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 /* Default to Latin-1 */
8019 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008023 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008024 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008025 writer.min_length = size;
8026 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008028
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008029 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008030 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8031 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008032 }
8033 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008037 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008038
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008040 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 return NULL;
8042}
8043
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044/* Charmap encoding: the lookup table */
8045
Alexander Belopolsky40018472011-02-26 01:02:56 +00008046struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 PyObject_HEAD
8048 unsigned char level1[32];
8049 int count2, count3;
8050 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051};
8052
8053static PyObject*
8054encoding_map_size(PyObject *obj, PyObject* args)
8055{
8056 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059}
8060
8061static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 PyDoc_STR("Return the size (in bytes) of this object") },
8064 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065};
8066
8067static void
8068encoding_map_dealloc(PyObject* o)
8069{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008070 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071}
8072
8073static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 "EncodingMap", /*tp_name*/
8076 sizeof(struct encoding_map), /*tp_basicsize*/
8077 0, /*tp_itemsize*/
8078 /* methods */
8079 encoding_map_dealloc, /*tp_dealloc*/
8080 0, /*tp_print*/
8081 0, /*tp_getattr*/
8082 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008083 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 0, /*tp_repr*/
8085 0, /*tp_as_number*/
8086 0, /*tp_as_sequence*/
8087 0, /*tp_as_mapping*/
8088 0, /*tp_hash*/
8089 0, /*tp_call*/
8090 0, /*tp_str*/
8091 0, /*tp_getattro*/
8092 0, /*tp_setattro*/
8093 0, /*tp_as_buffer*/
8094 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8095 0, /*tp_doc*/
8096 0, /*tp_traverse*/
8097 0, /*tp_clear*/
8098 0, /*tp_richcompare*/
8099 0, /*tp_weaklistoffset*/
8100 0, /*tp_iter*/
8101 0, /*tp_iternext*/
8102 encoding_map_methods, /*tp_methods*/
8103 0, /*tp_members*/
8104 0, /*tp_getset*/
8105 0, /*tp_base*/
8106 0, /*tp_dict*/
8107 0, /*tp_descr_get*/
8108 0, /*tp_descr_set*/
8109 0, /*tp_dictoffset*/
8110 0, /*tp_init*/
8111 0, /*tp_alloc*/
8112 0, /*tp_new*/
8113 0, /*tp_free*/
8114 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115};
8116
8117PyObject*
8118PyUnicode_BuildEncodingMap(PyObject* string)
8119{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 PyObject *result;
8121 struct encoding_map *mresult;
8122 int i;
8123 int need_dict = 0;
8124 unsigned char level1[32];
8125 unsigned char level2[512];
8126 unsigned char *mlevel1, *mlevel2, *mlevel3;
8127 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 int kind;
8129 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008130 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008133 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 PyErr_BadArgument();
8135 return NULL;
8136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 kind = PyUnicode_KIND(string);
8138 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 length = PyUnicode_GET_LENGTH(string);
8140 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 memset(level1, 0xFF, sizeof level1);
8142 memset(level2, 0xFF, sizeof level2);
8143
8144 /* If there isn't a one-to-one mapping of NULL to \0,
8145 or if there are non-BMP characters, we need to use
8146 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008149 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 ch = PyUnicode_READ(kind, data, i);
8152 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 need_dict = 1;
8154 break;
8155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 /* unmapped character */
8158 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 l1 = ch >> 11;
8160 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (level1[l1] == 0xFF)
8162 level1[l1] = count2++;
8163 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 }
8166
8167 if (count2 >= 0xFF || count3 >= 0xFF)
8168 need_dict = 1;
8169
8170 if (need_dict) {
8171 PyObject *result = PyDict_New();
8172 PyObject *key, *value;
8173 if (!result)
8174 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008175 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008177 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178 if (!key || !value)
8179 goto failed1;
8180 if (PyDict_SetItem(result, key, value) == -1)
8181 goto failed1;
8182 Py_DECREF(key);
8183 Py_DECREF(value);
8184 }
8185 return result;
8186 failed1:
8187 Py_XDECREF(key);
8188 Py_XDECREF(value);
8189 Py_DECREF(result);
8190 return NULL;
8191 }
8192
8193 /* Create a three-level trie */
8194 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8195 16*count2 + 128*count3 - 1);
8196 if (!result)
8197 return PyErr_NoMemory();
8198 PyObject_Init(result, &EncodingMapType);
8199 mresult = (struct encoding_map*)result;
8200 mresult->count2 = count2;
8201 mresult->count3 = count3;
8202 mlevel1 = mresult->level1;
8203 mlevel2 = mresult->level23;
8204 mlevel3 = mresult->level23 + 16*count2;
8205 memcpy(mlevel1, level1, 32);
8206 memset(mlevel2, 0xFF, 16*count2);
8207 memset(mlevel3, 0, 128*count3);
8208 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008209 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008211 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8212 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 /* unmapped character */
8214 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 o1 = ch>>11;
8216 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 i2 = 16*mlevel1[o1] + o2;
8218 if (mlevel2[i2] == 0xFF)
8219 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 i3 = 128*mlevel2[i2] + o3;
8222 mlevel3[i3] = i;
8223 }
8224 return result;
8225}
8226
8227static int
Victor Stinner22168992011-11-20 17:09:18 +01008228encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229{
8230 struct encoding_map *map = (struct encoding_map*)mapping;
8231 int l1 = c>>11;
8232 int l2 = (c>>7) & 0xF;
8233 int l3 = c & 0x7F;
8234 int i;
8235
Victor Stinner22168992011-11-20 17:09:18 +01008236 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238 if (c == 0)
8239 return 0;
8240 /* level 1*/
8241 i = map->level1[l1];
8242 if (i == 0xFF) {
8243 return -1;
8244 }
8245 /* level 2*/
8246 i = map->level23[16*i+l2];
8247 if (i == 0xFF) {
8248 return -1;
8249 }
8250 /* level 3 */
8251 i = map->level23[16*map->count2 + 128*i + l3];
8252 if (i == 0) {
8253 return -1;
8254 }
8255 return i;
8256}
8257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258/* Lookup the character ch in the mapping. If the character
8259 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008260 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008261static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008262charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263{
Christian Heimes217cfd12007-12-02 14:31:20 +00008264 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 PyObject *x;
8266
8267 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 x = PyObject_GetItem(mapping, w);
8270 Py_DECREF(w);
8271 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8273 /* No mapping found means: mapping is undefined. */
8274 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008275 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 } else
8277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008279 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008281 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 long value = PyLong_AS_LONG(x);
8283 if (value < 0 || value > 255) {
8284 PyErr_SetString(PyExc_TypeError,
8285 "character mapping must be in range(256)");
8286 Py_DECREF(x);
8287 return NULL;
8288 }
8289 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008291 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 /* wrong return value */
8295 PyErr_Format(PyExc_TypeError,
8296 "character mapping must return integer, bytes or None, not %.400s",
8297 x->ob_type->tp_name);
8298 Py_DECREF(x);
8299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 }
8301}
8302
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008303static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008304charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8307 /* exponentially overallocate to minimize reallocations */
8308 if (requiredsize < 2*outsize)
8309 requiredsize = 2*outsize;
8310 if (_PyBytes_Resize(outobj, requiredsize))
8311 return -1;
8312 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313}
8314
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008319 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 space is available. Return a new reference to the object that
8321 was put in the output buffer, or Py_None, if the mapping was undefined
8322 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008323 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008324static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008325charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 PyObject *rep;
8329 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008330 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331
Christian Heimes90aa7642007-12-19 02:45:37 +00008332 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 if (res == -1)
8336 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 if (outsize<requiredsize)
8338 if (charmapencode_resize(outobj, outpos, requiredsize))
8339 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008340 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 outstart[(*outpos)++] = (char)res;
8342 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343 }
8344
8345 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 Py_DECREF(rep);
8350 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008351 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 if (PyLong_Check(rep)) {
8353 Py_ssize_t requiredsize = *outpos+1;
8354 if (outsize<requiredsize)
8355 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8356 Py_DECREF(rep);
8357 return enc_EXCEPTION;
8358 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008359 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 else {
8363 const char *repchars = PyBytes_AS_STRING(rep);
8364 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8365 Py_ssize_t requiredsize = *outpos+repsize;
8366 if (outsize<requiredsize)
8367 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8368 Py_DECREF(rep);
8369 return enc_EXCEPTION;
8370 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008371 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 memcpy(outstart + *outpos, repchars, repsize);
8373 *outpos += repsize;
8374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 Py_DECREF(rep);
8377 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378}
8379
8380/* handle an error in PyUnicode_EncodeCharmap
8381 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008382static int
8383charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008386 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008387 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388{
8389 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008390 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008391 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008392 enum PyUnicode_Kind kind;
8393 void *data;
8394 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008396 Py_ssize_t collstartpos = *inpos;
8397 Py_ssize_t collendpos = *inpos+1;
8398 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 char *encoding = "charmap";
8400 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008401 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008403 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404
Benjamin Petersonbac79492012-01-14 13:34:47 -05008405 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008406 return -1;
8407 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 /* find all unencodable characters */
8409 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008411 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008412 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008413 val = encoding_map_lookup(ch, mapping);
8414 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 break;
8416 ++collendpos;
8417 continue;
8418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008419
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008420 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8421 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 if (rep==NULL)
8423 return -1;
8424 else if (rep!=Py_None) {
8425 Py_DECREF(rep);
8426 break;
8427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
8431 /* cache callback name lookup
8432 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008433 if (*error_handler == _Py_ERROR_UNKNOWN)
8434 *error_handler = get_error_handler(errors);
8435
8436 switch (*error_handler) {
8437 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008438 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008440
8441 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 x = charmapencode_output('?', mapping, res, respos);
8444 if (x==enc_EXCEPTION) {
8445 return -1;
8446 }
8447 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008448 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
8450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 }
8452 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008453 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 *inpos = collendpos;
8455 break;
Victor Stinner50149202015-09-22 00:26:54 +02008456
8457 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 /* generate replacement (temporarily (mis)uses p) */
8459 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 char buffer[2+29+1+1];
8461 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008462 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 for (cp = buffer; *cp; ++cp) {
8464 x = charmapencode_output(*cp, mapping, res, respos);
8465 if (x==enc_EXCEPTION)
8466 return -1;
8467 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008468 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return -1;
8470 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 }
8472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 *inpos = collendpos;
8474 break;
Victor Stinner50149202015-09-22 00:26:54 +02008475
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 default:
Victor Stinner50149202015-09-22 00:26:54 +02008477 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008478 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008482 if (PyBytes_Check(repunicode)) {
8483 /* Directly copy bytes result to output. */
8484 Py_ssize_t outsize = PyBytes_Size(*res);
8485 Py_ssize_t requiredsize;
8486 repsize = PyBytes_Size(repunicode);
8487 requiredsize = *respos + repsize;
8488 if (requiredsize > outsize)
8489 /* Make room for all additional bytes. */
8490 if (charmapencode_resize(res, respos, requiredsize)) {
8491 Py_DECREF(repunicode);
8492 return -1;
8493 }
8494 memcpy(PyBytes_AsString(*res) + *respos,
8495 PyBytes_AsString(repunicode), repsize);
8496 *respos += repsize;
8497 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008498 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008499 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008500 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008501 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008502 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008503 Py_DECREF(repunicode);
8504 return -1;
8505 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008506 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008507 data = PyUnicode_DATA(repunicode);
8508 kind = PyUnicode_KIND(repunicode);
8509 for (index = 0; index < repsize; index++) {
8510 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8511 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008513 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return -1;
8515 }
8516 else if (x==enc_FAILED) {
8517 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008518 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return -1;
8520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008521 }
8522 *inpos = newpos;
8523 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 }
8525 return 0;
8526}
8527
Alexander Belopolsky40018472011-02-26 01:02:56 +00008528PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008529_PyUnicode_EncodeCharmap(PyObject *unicode,
8530 PyObject *mapping,
8531 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 /* output object */
8534 PyObject *res = NULL;
8535 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008536 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008537 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008540 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008542 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008543 void *data;
8544 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Benjamin Petersonbac79492012-01-14 13:34:47 -05008546 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547 return NULL;
8548 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008549 data = PyUnicode_DATA(unicode);
8550 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 /* Default to Latin-1 */
8553 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 /* allocate enough for a simple encoding without
8557 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008558 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 if (res == NULL)
8560 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008561 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008565 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (x==enc_EXCEPTION) /* error */
8569 goto onError;
8570 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008573 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 &res, &respos)) {
8575 goto onError;
8576 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008577 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 else
8579 /* done with this character => adjust input position */
8580 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008584 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008585 if (_PyBytes_Resize(&res, respos) < 0)
8586 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008589 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 return res;
8591
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 Py_XDECREF(res);
8594 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008595 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 return NULL;
8597}
8598
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008599/* Deprecated */
8600PyObject *
8601PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8602 Py_ssize_t size,
8603 PyObject *mapping,
8604 const char *errors)
8605{
8606 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008607 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608 if (unicode == NULL)
8609 return NULL;
8610 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8611 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008612 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008613}
8614
Alexander Belopolsky40018472011-02-26 01:02:56 +00008615PyObject *
8616PyUnicode_AsCharmapString(PyObject *unicode,
8617 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618{
8619 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 PyErr_BadArgument();
8621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008623 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624}
8625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627static void
8628make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008630 Py_ssize_t startpos, Py_ssize_t endpos,
8631 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 *exceptionObject = _PyUnicodeTranslateError_Create(
8635 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 }
8637 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8639 goto onError;
8640 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8641 goto onError;
8642 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8643 goto onError;
8644 return;
8645 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008646 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647 }
8648}
8649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650/* error handling callback helper:
8651 build arguments, call the callback and check the arguments,
8652 put the result into newpos and return the replacement string, which
8653 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008654static PyObject *
8655unicode_translate_call_errorhandler(const char *errors,
8656 PyObject **errorHandler,
8657 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659 Py_ssize_t startpos, Py_ssize_t endpos,
8660 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008662 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008664 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 PyObject *restuple;
8666 PyObject *resunicode;
8667
8668 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 }
8673
8674 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008679 restuple = PyObject_CallFunctionObjArgs(
8680 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008684 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 Py_DECREF(restuple);
8686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008688 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 &resunicode, &i_newpos)) {
8690 Py_DECREF(restuple);
8691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008693 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 else
8696 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008698 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 Py_DECREF(restuple);
8700 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 Py_INCREF(resunicode);
8703 Py_DECREF(restuple);
8704 return resunicode;
8705}
8706
8707/* Lookup the character ch in the mapping and put the result in result,
8708 which must be decrefed by the caller.
8709 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008710static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712{
Christian Heimes217cfd12007-12-02 14:31:20 +00008713 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 PyObject *x;
8715
8716 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 x = PyObject_GetItem(mapping, w);
8719 Py_DECREF(w);
8720 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8722 /* No mapping found means: use 1:1 mapping. */
8723 PyErr_Clear();
8724 *result = NULL;
8725 return 0;
8726 } else
8727 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
8729 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 *result = x;
8731 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008733 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008735 if (value < 0 || value > MAX_UNICODE) {
8736 PyErr_Format(PyExc_ValueError,
8737 "character mapping must be in range(0x%x)",
8738 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 Py_DECREF(x);
8740 return -1;
8741 }
8742 *result = x;
8743 return 0;
8744 }
8745 else if (PyUnicode_Check(x)) {
8746 *result = x;
8747 return 0;
8748 }
8749 else {
8750 /* wrong return value */
8751 PyErr_SetString(PyExc_TypeError,
8752 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008753 Py_DECREF(x);
8754 return -1;
8755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756}
Victor Stinner1194ea02014-04-04 19:37:40 +02008757
8758/* lookup the character, write the result into the writer.
8759 Return 1 if the result was written into the writer, return 0 if the mapping
8760 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008762charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8763 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764{
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 PyObject *item;
8766
8767 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008769
8770 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008772 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008775 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008777
8778 if (item == Py_None) {
8779 Py_DECREF(item);
8780 return 0;
8781 }
8782
8783 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008784 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8785 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8786 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008787 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8788 Py_DECREF(item);
8789 return -1;
8790 }
8791 Py_DECREF(item);
8792 return 1;
8793 }
8794
8795 if (!PyUnicode_Check(item)) {
8796 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008798 }
8799
8800 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8801 Py_DECREF(item);
8802 return -1;
8803 }
8804
8805 Py_DECREF(item);
8806 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807}
8808
Victor Stinner89a76ab2014-04-05 11:44:04 +02008809static int
8810unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8811 Py_UCS1 *translate)
8812{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008813 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008814 int ret = 0;
8815
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 if (charmaptranslate_lookup(ch, mapping, &item)) {
8817 return -1;
8818 }
8819
8820 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008821 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008822 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008824 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 /* not found => default to 1:1 mapping */
8826 translate[ch] = ch;
8827 return 1;
8828 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008829 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008830 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008831 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8832 used it */
8833 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834 /* invalid character or character outside ASCII:
8835 skip the fast translate */
8836 goto exit;
8837 }
8838 translate[ch] = (Py_UCS1)replace;
8839 }
8840 else if (PyUnicode_Check(item)) {
8841 Py_UCS4 replace;
8842
8843 if (PyUnicode_READY(item) == -1) {
8844 Py_DECREF(item);
8845 return -1;
8846 }
8847 if (PyUnicode_GET_LENGTH(item) != 1)
8848 goto exit;
8849
8850 replace = PyUnicode_READ_CHAR(item, 0);
8851 if (replace > 127)
8852 goto exit;
8853 translate[ch] = (Py_UCS1)replace;
8854 }
8855 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008856 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 goto exit;
8858 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 ret = 1;
8860
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 exit:
8862 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 return ret;
8864}
8865
8866/* Fast path for ascii => ascii translation. Return 1 if the whole string
8867 was translated into writer, return 0 if the input string was partially
8868 translated into writer, raise an exception and return -1 on error. */
8869static int
8870unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008871 _PyUnicodeWriter *writer, int ignore,
8872 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008873{
Victor Stinner872b2912014-04-05 14:27:07 +02008874 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 Py_ssize_t len;
8876 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008877 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 len = PyUnicode_GET_LENGTH(input);
8880
Victor Stinner872b2912014-04-05 14:27:07 +02008881 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882
8883 in = PyUnicode_1BYTE_DATA(input);
8884 end = in + len;
8885
8886 assert(PyUnicode_IS_ASCII(writer->buffer));
8887 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8888 out = PyUnicode_1BYTE_DATA(writer->buffer);
8889
Victor Stinner872b2912014-04-05 14:27:07 +02008890 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008892 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008894 int translate = unicode_fast_translate_lookup(mapping, ch,
8895 ascii_table);
8896 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008898 if (translate == 0)
8899 goto exit;
8900 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 }
Victor Stinner872b2912014-04-05 14:27:07 +02008902 if (ch2 == 0xfe) {
8903 if (ignore)
8904 continue;
8905 goto exit;
8906 }
8907 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008909 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 }
Victor Stinner872b2912014-04-05 14:27:07 +02008911 res = 1;
8912
8913exit:
8914 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008915 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008916 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917}
8918
Victor Stinner3222da22015-10-01 22:07:32 +02008919static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920_PyUnicode_TranslateCharmap(PyObject *input,
8921 PyObject *mapping,
8922 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008925 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 Py_ssize_t size, i;
8927 int kind;
8928 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 _PyUnicodeWriter writer;
8930 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 char *reason = "character maps to <undefined>";
8932 PyObject *errorHandler = NULL;
8933 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008934 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008935 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 PyErr_BadArgument();
8939 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 if (PyUnicode_READY(input) == -1)
8943 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008944 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 kind = PyUnicode_KIND(input);
8946 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008948 if (size == 0)
8949 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008951 /* allocate enough for a simple 1:1 translation without
8952 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 _PyUnicodeWriter_Init(&writer);
8954 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956
Victor Stinner872b2912014-04-05 14:27:07 +02008957 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8958
Victor Stinner33798672016-03-01 21:59:58 +01008959 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008960 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008961 if (PyUnicode_IS_ASCII(input)) {
8962 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8963 if (res < 0) {
8964 _PyUnicodeWriter_Dealloc(&writer);
8965 return NULL;
8966 }
8967 if (res == 1)
8968 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 }
Victor Stinner33798672016-03-01 21:59:58 +01008970 else {
8971 i = 0;
8972 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008976 int translate;
8977 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8978 Py_ssize_t newpos;
8979 /* startpos for collecting untranslatable chars */
8980 Py_ssize_t collstart;
8981 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008982 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 ch = PyUnicode_READ(kind, data, i);
8985 translate = charmaptranslate_output(ch, mapping, &writer);
8986 if (translate < 0)
8987 goto onError;
8988
8989 if (translate != 0) {
8990 /* it worked => adjust input pointer */
8991 ++i;
8992 continue;
8993 }
8994
8995 /* untranslatable character */
8996 collstart = i;
8997 collend = i+1;
8998
8999 /* find all untranslatable characters */
9000 while (collend < size) {
9001 PyObject *x;
9002 ch = PyUnicode_READ(kind, data, collend);
9003 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009004 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009005 Py_XDECREF(x);
9006 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009008 ++collend;
9009 }
9010
9011 if (ignore) {
9012 i = collend;
9013 }
9014 else {
9015 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9016 reason, input, &exc,
9017 collstart, collend, &newpos);
9018 if (repunicode == NULL)
9019 goto onError;
9020 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009022 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009023 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 Py_DECREF(repunicode);
9025 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009026 }
9027 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009028 Py_XDECREF(exc);
9029 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009030 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009033 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009034 Py_XDECREF(exc);
9035 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 return NULL;
9037}
9038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039/* Deprecated. Use PyUnicode_Translate instead. */
9040PyObject *
9041PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9042 Py_ssize_t size,
9043 PyObject *mapping,
9044 const char *errors)
9045{
Christian Heimes5f520f42012-09-11 14:03:25 +02009046 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009047 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 if (!unicode)
9049 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009050 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9051 Py_DECREF(unicode);
9052 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053}
9054
Alexander Belopolsky40018472011-02-26 01:02:56 +00009055PyObject *
9056PyUnicode_Translate(PyObject *str,
9057 PyObject *mapping,
9058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009060 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009061 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009062 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063}
Tim Petersced69f82003-09-16 20:30:58 +00009064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009066fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067{
9068 /* No need to call PyUnicode_READY(self) because this function is only
9069 called as a callback from fixup() which does it already. */
9070 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9071 const int kind = PyUnicode_KIND(self);
9072 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009073 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009074 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 Py_ssize_t i;
9076
9077 for (i = 0; i < len; ++i) {
9078 ch = PyUnicode_READ(kind, data, i);
9079 fixed = 0;
9080 if (ch > 127) {
9081 if (Py_UNICODE_ISSPACE(ch))
9082 fixed = ' ';
9083 else {
9084 const int decimal = Py_UNICODE_TODECIMAL(ch);
9085 if (decimal >= 0)
9086 fixed = '0' + decimal;
9087 }
9088 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009089 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009090 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 PyUnicode_WRITE(kind, data, i, fixed);
9092 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009093 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009094 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 }
9097
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009098 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099}
9100
9101PyObject *
9102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9103{
9104 if (!PyUnicode_Check(unicode)) {
9105 PyErr_BadInternalCall();
9106 return NULL;
9107 }
9108 if (PyUnicode_READY(unicode) == -1)
9109 return NULL;
9110 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9111 /* If the string is already ASCII, just return the same string */
9112 Py_INCREF(unicode);
9113 return unicode;
9114 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009115 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116}
9117
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118PyObject *
9119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9120 Py_ssize_t length)
9121{
Victor Stinnerf0124502011-11-21 23:12:56 +01009122 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009124 Py_UCS4 maxchar;
9125 enum PyUnicode_Kind kind;
9126 void *data;
9127
Victor Stinner99d7ad02012-02-22 13:37:39 +01009128 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009129 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009130 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 if (ch > 127) {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009134 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009135 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 }
9137 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009138
9139 /* Copy to a new string */
9140 decimal = PyUnicode_New(length, maxchar);
9141 if (decimal == NULL)
9142 return decimal;
9143 kind = PyUnicode_KIND(decimal);
9144 data = PyUnicode_DATA(decimal);
9145 /* Iterate over code points */
9146 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009147 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009148 if (ch > 127) {
9149 int decimal = Py_UNICODE_TODECIMAL(ch);
9150 if (decimal >= 0)
9151 ch = '0' + decimal;
9152 }
9153 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009155 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009157/* --- Decimal Encoder ---------------------------------------------------- */
9158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159int
9160PyUnicode_EncodeDecimal(Py_UNICODE *s,
9161 Py_ssize_t length,
9162 char *output,
9163 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009164{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009165 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009166 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009167 enum PyUnicode_Kind kind;
9168 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169
9170 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 PyErr_BadArgument();
9172 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009173 }
9174
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009175 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009176 if (unicode == NULL)
9177 return -1;
9178
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009183 PyObject *exc;
9184 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009192 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
Victor Stinner6345be92011-11-25 20:09:01 +01009206
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220}
9221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222/* --- Helpers ------------------------------------------------------------ */
9223
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of the
   sequence being sliced and may be evaluated more than once.

     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it and is clamped to 0
       if it is still negative.

   `start` is NOT clamped to `len`; callers compare `end - start` (or
   `end < start`) afterwards to detect an empty slice.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can be used safely as the body of an unbraced
   if/else.  Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242 Py_ssize_t end,
9243 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 if (kind1 < kind2)
9252 return -1;
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277
Victor Stinner794d5672011-10-10 03:21:36 +02009278 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009293 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009294 }
9295 }
9296 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009311 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
9321Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009322_PyUnicode_InsertThousandsGrouping(
9323 PyObject *unicode, Py_ssize_t index,
9324 Py_ssize_t n_buffer,
9325 void *digits, Py_ssize_t n_digits,
9326 Py_ssize_t min_width,
9327 const char *grouping, PyObject *thousands_sep,
9328 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329{
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009331 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 Py_ssize_t thousands_sep_len;
9333 Py_ssize_t len;
9334
9335 if (unicode != NULL) {
9336 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009337 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 }
9339 else {
9340 kind = PyUnicode_1BYTE_KIND;
9341 data = NULL;
9342 }
9343 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9344 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9345 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9346 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009347 if (thousands_sep_kind < kind) {
9348 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9349 if (!thousands_sep_data)
9350 return -1;
9351 }
9352 else {
9353 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9354 if (!data)
9355 return -1;
9356 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 }
9358
Benjamin Petersonead6b532011-12-20 17:23:42 -06009359 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 break;
9384 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009385 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009387 if (unicode != NULL && thousands_sep_kind != kind) {
9388 if (thousands_sep_kind < kind)
9389 PyMem_Free(thousands_sep_data);
9390 else
9391 PyMem_Free(data);
9392 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009393 if (unicode == NULL) {
9394 *maxchar = 127;
9395 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009396 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009397 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 }
9399 }
9400 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401}
9402
9403
Alexander Belopolsky40018472011-02-26 01:02:56 +00009404Py_ssize_t
9405PyUnicode_Count(PyObject *str,
9406 PyObject *substr,
9407 Py_ssize_t start,
9408 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009410 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 void *buf1 = NULL, *buf2 = NULL;
9413 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009417
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 kind1 = PyUnicode_KIND(str);
9419 kind2 = PyUnicode_KIND(substr);
9420 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009421 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 len1 = PyUnicode_GET_LENGTH(str);
9424 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009426 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009429 buf1 = PyUnicode_DATA(str);
9430 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009432 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (!buf2)
9434 goto onError;
9435 }
9436
9437 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009439 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009440 result = asciilib_count(
9441 ((Py_UCS1*)buf1) + start, end - start,
9442 buf2, len2, PY_SSIZE_T_MAX
9443 );
9444 else
9445 result = ucs1lib_count(
9446 ((Py_UCS1*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 break;
9450 case PyUnicode_2BYTE_KIND:
9451 result = ucs2lib_count(
9452 ((Py_UCS2*)buf1) + start, end - start,
9453 buf2, len2, PY_SSIZE_T_MAX
9454 );
9455 break;
9456 case PyUnicode_4BYTE_KIND:
9457 result = ucs4lib_count(
9458 ((Py_UCS4*)buf1) + start, end - start,
9459 buf2, len2, PY_SSIZE_T_MAX
9460 );
9461 break;
9462 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009463 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009466 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 PyMem_Free(buf2);
9468
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 PyMem_Free(buf2);
9473 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474}
9475
Alexander Belopolsky40018472011-02-26 01:02:56 +00009476Py_ssize_t
9477PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009478 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009479 Py_ssize_t start,
9480 Py_ssize_t end,
9481 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009483 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009485
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009486 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487}
9488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489Py_ssize_t
9490PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9491 Py_ssize_t start, Py_ssize_t end,
9492 int direction)
9493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009495 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 if (PyUnicode_READY(str) == -1)
9497 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009498 len = PyUnicode_GET_LENGTH(str);
9499 ADJUST_INDICES(start, end, len);
9500 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009501 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009503 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9504 kind, end-start, ch, direction);
9505 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009507 else
9508 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509}
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009512tailmatch(PyObject *self,
9513 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009514 Py_ssize_t start,
9515 Py_ssize_t end,
9516 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 int kind_self;
9519 int kind_sub;
9520 void *data_self;
9521 void *data_sub;
9522 Py_ssize_t offset;
9523 Py_ssize_t i;
9524 Py_ssize_t end_sub;
9525
9526 if (PyUnicode_READY(self) == -1 ||
9527 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009528 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9531 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009535 if (PyUnicode_GET_LENGTH(substring) == 0)
9536 return 1;
9537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 kind_self = PyUnicode_KIND(self);
9539 data_self = PyUnicode_DATA(self);
9540 kind_sub = PyUnicode_KIND(substring);
9541 data_sub = PyUnicode_DATA(substring);
9542 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9543
9544 if (direction > 0)
9545 offset = end;
9546 else
9547 offset = start;
9548
9549 if (PyUnicode_READ(kind_self, data_self, offset) ==
9550 PyUnicode_READ(kind_sub, data_sub, 0) &&
9551 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9552 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9553 /* If both are of the same kind, memcmp is sufficient */
9554 if (kind_self == kind_sub) {
9555 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009556 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 data_sub,
9558 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009559 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009561 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 else {
9563 /* We do not need to compare 0 and len(substring)-1 because
9564 the if statement above ensured already that they are equal
9565 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 for (i = 1; i < end_sub; ++i) {
9567 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9568 PyUnicode_READ(kind_sub, data_sub, i))
9569 return 0;
9570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573 }
9574
9575 return 0;
9576}
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578Py_ssize_t
9579PyUnicode_Tailmatch(PyObject *str,
9580 PyObject *substr,
9581 Py_ssize_t start,
9582 Py_ssize_t end,
9583 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589}
9590
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591/* Apply fixfct filter to the Unicode object self and return a
9592 reference to the modified object */
9593
Alexander Belopolsky40018472011-02-26 01:02:56 +00009594static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009595fixup(PyObject *self,
9596 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 PyObject *u;
9599 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009600 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009602 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009605 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 /* fix functions return the new maximum character in a string,
9608 if the kind of the resulting unicode object does not change,
9609 everything is fine. Otherwise we need to change the string kind
9610 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009611 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009612
9613 if (maxchar_new == 0) {
9614 /* no changes */;
9615 if (PyUnicode_CheckExact(self)) {
9616 Py_DECREF(u);
9617 Py_INCREF(self);
9618 return self;
9619 }
9620 else
9621 return u;
9622 }
9623
Victor Stinnere6abb482012-05-02 01:15:40 +02009624 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625
Victor Stinnereaab6042011-12-11 22:22:39 +01009626 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009628
9629 /* In case the maximum character changed, we need to
9630 convert the string to the new category. */
9631 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9632 if (v == NULL) {
9633 Py_DECREF(u);
9634 return NULL;
9635 }
9636 if (maxchar_new > maxchar_old) {
9637 /* If the maxchar increased so that the kind changed, not all
9638 characters are representable anymore and we need to fix the
9639 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009640 _PyUnicode_FastCopyCharacters(v, 0,
9641 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009642 maxchar_old = fixfct(v);
9643 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 }
9645 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009646 _PyUnicode_FastCopyCharacters(v, 0,
9647 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009649 Py_DECREF(u);
9650 assert(_PyUnicode_CheckConsistency(v, 1));
9651 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654static PyObject *
9655ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9658 char *resdata, *data = PyUnicode_DATA(self);
9659 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009660
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 res = PyUnicode_New(len, 127);
9662 if (res == NULL)
9663 return NULL;
9664 resdata = PyUnicode_DATA(res);
9665 if (lower)
9666 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 _Py_bytes_upper(resdata, data, len);
9669 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670}
9671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 Py_ssize_t j;
9676 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009677 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009679
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9681
9682 where ! is a negation and \p{xxx} is a character with property xxx.
9683 */
9684 for (j = i - 1; j >= 0; j--) {
9685 c = PyUnicode_READ(kind, data, j);
9686 if (!_PyUnicode_IsCaseIgnorable(c))
9687 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9690 if (final_sigma) {
9691 for (j = i + 1; j < length; j++) {
9692 c = PyUnicode_READ(kind, data, j);
9693 if (!_PyUnicode_IsCaseIgnorable(c))
9694 break;
9695 }
9696 final_sigma = j == length || !_PyUnicode_IsCased(c);
9697 }
9698 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701static int
9702lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9703 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 /* Obscure special case. */
9706 if (c == 0x3A3) {
9707 mapped[0] = handle_capital_sigma(kind, data, length, i);
9708 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711}
9712
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713static Py_ssize_t
9714do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 Py_ssize_t i, k = 0;
9717 int n_res, j;
9718 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 c = PyUnicode_READ(kind, data, 0);
9721 n_res = _PyUnicode_ToUpperFull(c, mapped);
9722 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009723 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 for (i = 1; i < length; i++) {
9727 c = PyUnicode_READ(kind, data, i);
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009730 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009732 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009733 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737static Py_ssize_t
9738do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9739 Py_ssize_t i, k = 0;
9740
9741 for (i = 0; i < length; i++) {
9742 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9743 int n_res, j;
9744 if (Py_UNICODE_ISUPPER(c)) {
9745 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9746 }
9747 else if (Py_UNICODE_ISLOWER(c)) {
9748 n_res = _PyUnicode_ToUpperFull(c, mapped);
9749 }
9750 else {
9751 n_res = 1;
9752 mapped[0] = c;
9753 }
9754 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009755 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 res[k++] = mapped[j];
9757 }
9758 }
9759 return k;
9760}
9761
9762static Py_ssize_t
9763do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9764 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009766 Py_ssize_t i, k = 0;
9767
9768 for (i = 0; i < length; i++) {
9769 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9770 int n_res, j;
9771 if (lower)
9772 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9773 else
9774 n_res = _PyUnicode_ToUpperFull(c, mapped);
9775 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009776 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777 res[k++] = mapped[j];
9778 }
9779 }
9780 return k;
9781}
9782
9783static Py_ssize_t
9784do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9785{
9786 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9787}
9788
9789static Py_ssize_t
9790do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9793}
9794
Benjamin Petersone51757f2012-01-12 21:10:29 -05009795static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009796do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799
9800 for (i = 0; i < length; i++) {
9801 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9802 Py_UCS4 mapped[3];
9803 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9804 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009805 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009806 res[k++] = mapped[j];
9807 }
9808 }
9809 return k;
9810}
9811
9812static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009813do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9814{
9815 Py_ssize_t i, k = 0;
9816 int previous_is_cased;
9817
9818 previous_is_cased = 0;
9819 for (i = 0; i < length; i++) {
9820 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9821 Py_UCS4 mapped[3];
9822 int n_res, j;
9823
9824 if (previous_is_cased)
9825 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9826 else
9827 n_res = _PyUnicode_ToTitleFull(c, mapped);
9828
9829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009831 res[k++] = mapped[j];
9832 }
9833
9834 previous_is_cased = _PyUnicode_IsCased(c);
9835 }
9836 return k;
9837}
9838
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839static PyObject *
9840case_operation(PyObject *self,
9841 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9842{
9843 PyObject *res = NULL;
9844 Py_ssize_t length, newlength = 0;
9845 int kind, outkind;
9846 void *data, *outdata;
9847 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9848
Benjamin Petersoneea48462012-01-16 14:28:50 -05009849 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009850
9851 kind = PyUnicode_KIND(self);
9852 data = PyUnicode_DATA(self);
9853 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009854 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009855 PyErr_SetString(PyExc_OverflowError, "string is too long");
9856 return NULL;
9857 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009858 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859 if (tmp == NULL)
9860 return PyErr_NoMemory();
9861 newlength = perform(kind, data, length, tmp, &maxchar);
9862 res = PyUnicode_New(newlength, maxchar);
9863 if (res == NULL)
9864 goto leave;
9865 tmpend = tmp + newlength;
9866 outdata = PyUnicode_DATA(res);
9867 outkind = PyUnicode_KIND(res);
9868 switch (outkind) {
9869 case PyUnicode_1BYTE_KIND:
9870 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9871 break;
9872 case PyUnicode_2BYTE_KIND:
9873 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9874 break;
9875 case PyUnicode_4BYTE_KIND:
9876 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9877 break;
9878 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009879 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 }
9881 leave:
9882 PyMem_FREE(tmp);
9883 return res;
9884}
9885
Tim Peters8ce9f162004-08-27 01:49:32 +00009886PyObject *
9887PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009889 PyObject *res;
9890 PyObject *fseq;
9891 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009892 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009894 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009895 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009897 }
9898
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009899 /* NOTE: the following code can't call back into Python code,
9900 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009901 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009902
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009903 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009905 res = _PyUnicode_JoinArray(separator, items, seqlen);
9906 Py_DECREF(fseq);
9907 return res;
9908}
9909
9910PyObject *
9911_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9912{
9913 PyObject *res = NULL; /* the result */
9914 PyObject *sep = NULL;
9915 Py_ssize_t seplen;
9916 PyObject *item;
9917 Py_ssize_t sz, i, res_offset;
9918 Py_UCS4 maxchar;
9919 Py_UCS4 item_maxchar;
9920 int use_memcpy;
9921 unsigned char *res_data = NULL, *sep_data = NULL;
9922 PyObject *last_obj;
9923 unsigned int kind = 0;
9924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 /* If empty sequence, return u"". */
9926 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009927 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009928 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009929
Tim Peters05eba1f2004-08-27 21:32:02 +00009930 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009931 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009932 if (seqlen == 1) {
9933 if (PyUnicode_CheckExact(items[0])) {
9934 res = items[0];
9935 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009936 return res;
9937 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009938 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009939 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009940 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009942 /* Set up sep and seplen */
9943 if (separator == NULL) {
9944 /* fall back to a blank space separator */
9945 sep = PyUnicode_FromOrdinal(' ');
9946 if (!sep)
9947 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009948 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009949 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009950 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009951 else {
9952 if (!PyUnicode_Check(separator)) {
9953 PyErr_Format(PyExc_TypeError,
9954 "separator: expected str instance,"
9955 " %.80s found",
9956 Py_TYPE(separator)->tp_name);
9957 goto onError;
9958 }
9959 if (PyUnicode_READY(separator))
9960 goto onError;
9961 sep = separator;
9962 seplen = PyUnicode_GET_LENGTH(separator);
9963 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9964 /* inc refcount to keep this code path symmetric with the
9965 above case of a blank separator */
9966 Py_INCREF(sep);
9967 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009968 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009969 }
9970
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009971 /* There are at least two things to join, or else we have a subclass
9972 * of str in the sequence.
9973 * Do a pre-pass to figure out the total amount of space we'll
9974 * need (sz), and see whether all argument are strings.
9975 */
9976 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009977#ifdef Py_DEBUG
9978 use_memcpy = 0;
9979#else
9980 use_memcpy = 1;
9981#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009982 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009983 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009984 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 if (!PyUnicode_Check(item)) {
9986 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009987 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 " %.80s found",
9989 i, Py_TYPE(item)->tp_name);
9990 goto onError;
9991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 if (PyUnicode_READY(item) == -1)
9993 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009994 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009996 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009997 if (i != 0) {
9998 add_sz += seplen;
9999 }
10000 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 goto onError;
10004 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010005 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010006 if (use_memcpy && last_obj != NULL) {
10007 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10008 use_memcpy = 0;
10009 }
10010 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010011 }
Tim Petersced69f82003-09-16 20:30:58 +000010012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010014 if (res == NULL)
10015 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010016
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010018#ifdef Py_DEBUG
10019 use_memcpy = 0;
10020#else
10021 if (use_memcpy) {
10022 res_data = PyUnicode_1BYTE_DATA(res);
10023 kind = PyUnicode_KIND(res);
10024 if (seplen != 0)
10025 sep_data = PyUnicode_1BYTE_DATA(sep);
10026 }
10027#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010028 if (use_memcpy) {
10029 for (i = 0; i < seqlen; ++i) {
10030 Py_ssize_t itemlen;
10031 item = items[i];
10032
10033 /* Copy item, and maybe the separator. */
10034 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010035 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010036 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010037 kind * seplen);
10038 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010040
10041 itemlen = PyUnicode_GET_LENGTH(item);
10042 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010043 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010044 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010045 kind * itemlen);
10046 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010048 }
10049 assert(res_data == PyUnicode_1BYTE_DATA(res)
10050 + kind * PyUnicode_GET_LENGTH(res));
10051 }
10052 else {
10053 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10054 Py_ssize_t itemlen;
10055 item = items[i];
10056
10057 /* Copy item, and maybe the separator. */
10058 if (i && seplen != 0) {
10059 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10060 res_offset += seplen;
10061 }
10062
10063 itemlen = PyUnicode_GET_LENGTH(item);
10064 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010065 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010066 res_offset += itemlen;
10067 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010068 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010070 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010073 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010078 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079 return NULL;
10080}
10081
/* Store `length` copies of `value` into the character buffer `data`,
   starting at character index `start`.  `kind` selects the width of the
   buffer's elements (1, 2 or 4 bytes per character) and must not be
   PyUnicode_WCHAR_KIND.

   Every macro argument is parenthesized where it is used, and `value` and
   `length` are evaluated exactly once, so arbitrary expressions can be
   passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_len_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10105
Victor Stinnerd3f08822012-05-29 12:57:52 +020010106void
10107_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10108 Py_UCS4 fill_char)
10109{
10110 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10111 const void *data = PyUnicode_DATA(unicode);
10112 assert(PyUnicode_IS_READY(unicode));
10113 assert(unicode_modifiable(unicode));
10114 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10115 assert(start >= 0);
10116 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10117 FILL(kind, data, fill_char, start, length);
10118}
10119
Victor Stinner3fe55312012-01-04 00:33:50 +010010120Py_ssize_t
10121PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10122 Py_UCS4 fill_char)
10123{
10124 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010125
10126 if (!PyUnicode_Check(unicode)) {
10127 PyErr_BadInternalCall();
10128 return -1;
10129 }
10130 if (PyUnicode_READY(unicode) == -1)
10131 return -1;
10132 if (unicode_check_modifiable(unicode))
10133 return -1;
10134
Victor Stinnerd3f08822012-05-29 12:57:52 +020010135 if (start < 0) {
10136 PyErr_SetString(PyExc_IndexError, "string index out of range");
10137 return -1;
10138 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010139 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10140 PyErr_SetString(PyExc_ValueError,
10141 "fill character is bigger than "
10142 "the string maximum character");
10143 return -1;
10144 }
10145
10146 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10147 length = Py_MIN(maxlen, length);
10148 if (length <= 0)
10149 return 0;
10150
Victor Stinnerd3f08822012-05-29 12:57:52 +020010151 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010152 return length;
10153}
10154
Victor Stinner9310abb2011-10-05 00:59:23 +020010155static PyObject *
10156pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010157 Py_ssize_t left,
10158 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 PyObject *u;
10162 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010163 int kind;
10164 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165
10166 if (left < 0)
10167 left = 0;
10168 if (right < 0)
10169 right = 0;
10170
Victor Stinnerc4b49542011-12-11 22:44:26 +010010171 if (left == 0 && right == 0)
10172 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10175 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010176 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10177 return NULL;
10178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010180 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010182 if (!u)
10183 return NULL;
10184
10185 kind = PyUnicode_KIND(u);
10186 data = PyUnicode_DATA(u);
10187 if (left)
10188 FILL(kind, data, fill, 0, left);
10189 if (right)
10190 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010191 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010192 assert(_PyUnicode_CheckConsistency(u, 1));
10193 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194}
10195
Alexander Belopolsky40018472011-02-26 01:02:56 +000010196PyObject *
10197PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010201 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
Benjamin Petersonead6b532011-12-20 17:23:42 -060010204 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206 if (PyUnicode_IS_ASCII(string))
10207 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010208 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209 PyUnicode_GET_LENGTH(string), keepends);
10210 else
10211 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 break;
10215 case PyUnicode_2BYTE_KIND:
10216 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 PyUnicode_GET_LENGTH(string), keepends);
10219 break;
10220 case PyUnicode_4BYTE_KIND:
10221 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010222 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 PyUnicode_GET_LENGTH(string), keepends);
10224 break;
10225 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010226 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229}
10230
Alexander Belopolsky40018472011-02-26 01:02:56 +000010231static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010232split(PyObject *self,
10233 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010234 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010236 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 void *buf1, *buf2;
10238 Py_ssize_t len1, len2;
10239 PyObject* out;
10240
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010242 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (PyUnicode_READY(self) == -1)
10245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010248 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010250 if (PyUnicode_IS_ASCII(self))
10251 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010253 PyUnicode_GET_LENGTH(self), maxcount
10254 );
10255 else
10256 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 PyUnicode_GET_LENGTH(self), maxcount
10259 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 case PyUnicode_2BYTE_KIND:
10261 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 PyUnicode_GET_LENGTH(self), maxcount
10264 );
10265 case PyUnicode_4BYTE_KIND:
10266 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010271 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 }
10273
10274 if (PyUnicode_READY(substring) == -1)
10275 return NULL;
10276
10277 kind1 = PyUnicode_KIND(self);
10278 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 len1 = PyUnicode_GET_LENGTH(self);
10280 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010281 if (kind1 < kind2 || len1 < len2) {
10282 out = PyList_New(1);
10283 if (out == NULL)
10284 return NULL;
10285 Py_INCREF(self);
10286 PyList_SET_ITEM(out, 0, self);
10287 return out;
10288 }
10289 buf1 = PyUnicode_DATA(self);
10290 buf2 = PyUnicode_DATA(substring);
10291 if (kind2 != kind1) {
10292 buf2 = _PyUnicode_AsKind(substring, kind1);
10293 if (!buf2)
10294 return NULL;
10295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010297 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10300 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010301 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 else
10303 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 break;
10306 case PyUnicode_2BYTE_KIND:
10307 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 break;
10310 case PyUnicode_4BYTE_KIND:
10311 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 break;
10314 default:
10315 out = NULL;
10316 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010317 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 PyMem_Free(buf2);
10319 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320}
10321
Alexander Belopolsky40018472011-02-26 01:02:56 +000010322static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010323rsplit(PyObject *self,
10324 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010325 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010326{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010327 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 void *buf1, *buf2;
10329 Py_ssize_t len1, len2;
10330 PyObject* out;
10331
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010332 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010333 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (PyUnicode_READY(self) == -1)
10336 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010339 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 if (PyUnicode_IS_ASCII(self))
10342 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 PyUnicode_GET_LENGTH(self), maxcount
10345 );
10346 else
10347 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 PyUnicode_GET_LENGTH(self), maxcount
10350 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 case PyUnicode_2BYTE_KIND:
10352 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 PyUnicode_GET_LENGTH(self), maxcount
10355 );
10356 case PyUnicode_4BYTE_KIND:
10357 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010362 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 }
10364
10365 if (PyUnicode_READY(substring) == -1)
10366 return NULL;
10367
10368 kind1 = PyUnicode_KIND(self);
10369 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 len1 = PyUnicode_GET_LENGTH(self);
10371 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010372 if (kind1 < kind2 || len1 < len2) {
10373 out = PyList_New(1);
10374 if (out == NULL)
10375 return NULL;
10376 Py_INCREF(self);
10377 PyList_SET_ITEM(out, 0, self);
10378 return out;
10379 }
10380 buf1 = PyUnicode_DATA(self);
10381 buf2 = PyUnicode_DATA(substring);
10382 if (kind2 != kind1) {
10383 buf2 = _PyUnicode_AsKind(substring, kind1);
10384 if (!buf2)
10385 return NULL;
10386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010388 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010390 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10391 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010392 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393 else
10394 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 break;
10397 case PyUnicode_2BYTE_KIND:
10398 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 break;
10401 case PyUnicode_4BYTE_KIND:
10402 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010403 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 break;
10405 default:
10406 out = NULL;
10407 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010408 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 PyMem_Free(buf2);
10410 return out;
10411}
10412
10413static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010414anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10415 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010417 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10420 return asciilib_find(buf1, len1, buf2, len2, offset);
10421 else
10422 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 case PyUnicode_2BYTE_KIND:
10424 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10425 case PyUnicode_4BYTE_KIND:
10426 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10427 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010428 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429}
10430
10431static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10433 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010435 switch (kind) {
10436 case PyUnicode_1BYTE_KIND:
10437 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10438 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10439 else
10440 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10441 case PyUnicode_2BYTE_KIND:
10442 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10443 case PyUnicode_4BYTE_KIND:
10444 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10445 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010446 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010447}
10448
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010449static void
10450replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10451 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10452{
10453 int kind = PyUnicode_KIND(u);
10454 void *data = PyUnicode_DATA(u);
10455 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10456 if (kind == PyUnicode_1BYTE_KIND) {
10457 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10458 (Py_UCS1 *)data + len,
10459 u1, u2, maxcount);
10460 }
10461 else if (kind == PyUnicode_2BYTE_KIND) {
10462 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10463 (Py_UCS2 *)data + len,
10464 u1, u2, maxcount);
10465 }
10466 else {
10467 assert(kind == PyUnicode_4BYTE_KIND);
10468 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10469 (Py_UCS4 *)data + len,
10470 u1, u2, maxcount);
10471 }
10472}
10473
Alexander Belopolsky40018472011-02-26 01:02:56 +000010474static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475replace(PyObject *self, PyObject *str1,
10476 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 PyObject *u;
10479 char *sbuf = PyUnicode_DATA(self);
10480 char *buf1 = PyUnicode_DATA(str1);
10481 char *buf2 = PyUnicode_DATA(str2);
10482 int srelease = 0, release1 = 0, release2 = 0;
10483 int skind = PyUnicode_KIND(self);
10484 int kind1 = PyUnicode_KIND(str1);
10485 int kind2 = PyUnicode_KIND(str2);
10486 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10487 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10488 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491
10492 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010493 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010495 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
Victor Stinner59de0ee2011-10-07 10:01:28 +020010497 if (str1 == str2)
10498 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499
Victor Stinner49a0a212011-10-12 23:46:10 +020010500 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010501 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10502 if (maxchar < maxchar_str1)
10503 /* substring too wide to be present */
10504 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10506 /* Replacing str1 with str2 may cause a maxchar reduction in the
10507 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010509 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010519
Victor Stinner69ed0f42013-04-09 21:48:24 +020010520 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010521 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010522 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010524 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010528
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010529 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10530 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010531 }
10532 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 int rkind = skind;
10534 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010535 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (kind1 < rkind) {
10538 /* widen substring */
10539 buf1 = _PyUnicode_AsKind(str1, rkind);
10540 if (!buf1) goto error;
10541 release1 = 1;
10542 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010543 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544 if (i < 0)
10545 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (rkind > kind2) {
10547 /* widen replacement */
10548 buf2 = _PyUnicode_AsKind(str2, rkind);
10549 if (!buf2) goto error;
10550 release2 = 1;
10551 }
10552 else if (rkind < kind2) {
10553 /* widen self and buf1 */
10554 rkind = kind2;
10555 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010556 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 sbuf = _PyUnicode_AsKind(self, rkind);
10558 if (!sbuf) goto error;
10559 srelease = 1;
10560 buf1 = _PyUnicode_AsKind(str1, rkind);
10561 if (!buf1) goto error;
10562 release1 = 1;
10563 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 u = PyUnicode_New(slen, maxchar);
10565 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 assert(PyUnicode_KIND(u) == rkind);
10568 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010569
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010570 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010571 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010572 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576
10577 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010578 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010580 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010581 if (i == -1)
10582 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010589 }
10590 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010592 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 int rkind = skind;
10594 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf1 = _PyUnicode_AsKind(str1, rkind);
10599 if (!buf1) goto error;
10600 release1 = 1;
10601 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010603 if (n == 0)
10604 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf2 = _PyUnicode_AsKind(str2, rkind);
10608 if (!buf2) goto error;
10609 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 rkind = kind2;
10614 sbuf = _PyUnicode_AsKind(self, rkind);
10615 if (!sbuf) goto error;
10616 srelease = 1;
10617 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010618 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 buf1 = _PyUnicode_AsKind(str1, rkind);
10620 if (!buf1) goto error;
10621 release1 = 1;
10622 }
10623 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10624 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 PyErr_SetString(PyExc_OverflowError,
10627 "replace string is too long");
10628 goto error;
10629 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010630 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010631 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010632 _Py_INCREF_UNICODE_EMPTY();
10633 if (!unicode_empty)
10634 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 u = unicode_empty;
10636 goto done;
10637 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010638 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 PyErr_SetString(PyExc_OverflowError,
10640 "replace string is too long");
10641 goto error;
10642 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 u = PyUnicode_New(new_size, maxchar);
10644 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 assert(PyUnicode_KIND(u) == rkind);
10647 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 ires = i = 0;
10649 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 while (n-- > 0) {
10651 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010652 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010654 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010655 if (j == -1)
10656 break;
10657 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 memcpy(res + rkind * ires,
10660 sbuf + rkind * i,
10661 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 }
10664 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
10676 sbuf + rkind * i,
10677 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010678 }
10679 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 /* interleave */
10681 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010682 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 if (--n <= 0)
10687 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010688 memcpy(res + rkind * ires,
10689 sbuf + rkind * i,
10690 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 ires++;
10692 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 memcpy(res + rkind * ires,
10695 sbuf + rkind * i,
10696 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010698 }
10699
10700 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010701 unicode_adjust_maxchar(&u);
10702 if (u == NULL)
10703 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010705
10706 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (srelease)
10708 PyMem_FREE(sbuf);
10709 if (release1)
10710 PyMem_FREE(buf1);
10711 if (release2)
10712 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010713 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010715
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010717 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (srelease)
10719 PyMem_FREE(sbuf);
10720 if (release1)
10721 PyMem_FREE(buf1);
10722 if (release2)
10723 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010724 return unicode_result_unchanged(self);
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 error:
10727 if (srelease && sbuf)
10728 PyMem_FREE(sbuf);
10729 if (release1 && buf1)
10730 PyMem_FREE(buf1);
10731 if (release2 && buf2)
10732 PyMem_FREE(buf2);
10733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734}
10735
10736/* --- Unicode Object Methods --------------------------------------------- */
10737
INADA Naoki3ae20562017-01-16 20:41:20 +090010738/*[clinic input]
10739str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
INADA Naoki3ae20562017-01-16 20:41:20 +090010741Return a version of the string where each word is titlecased.
10742
10743More specifically, words start with uppercased characters and all remaining
10744cased characters have lower case.
10745[clinic start generated code]*/
10746
10747static PyObject *
10748unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010749/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010751 if (PyUnicode_READY(self) == -1)
10752 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010753 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754}
10755
INADA Naoki3ae20562017-01-16 20:41:20 +090010756/*[clinic input]
10757str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758
INADA Naoki3ae20562017-01-16 20:41:20 +090010759Return a capitalized version of the string.
10760
10761More specifically, make the first character have upper case and the rest lower
10762case.
10763[clinic start generated code]*/
10764
10765static PyObject *
10766unicode_capitalize_impl(PyObject *self)
10767/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010769 if (PyUnicode_READY(self) == -1)
10770 return NULL;
10771 if (PyUnicode_GET_LENGTH(self) == 0)
10772 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010773 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774}
10775
INADA Naoki3ae20562017-01-16 20:41:20 +090010776/*[clinic input]
10777str.casefold as unicode_casefold
10778
10779Return a version of the string suitable for caseless comparisons.
10780[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010781
10782static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010783unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010784/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010785{
10786 if (PyUnicode_READY(self) == -1)
10787 return NULL;
10788 if (PyUnicode_IS_ASCII(self))
10789 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010790 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010791}
10792
10793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010794/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010795
10796static int
10797convert_uc(PyObject *obj, void *addr)
10798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010801 if (!PyUnicode_Check(obj)) {
10802 PyErr_Format(PyExc_TypeError,
10803 "The fill character must be a unicode character, "
10804 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 return 0;
10806 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010807 if (PyUnicode_READY(obj) < 0)
10808 return 0;
10809 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010810 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return 0;
10813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010814 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010815 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010816}
10817
INADA Naoki3ae20562017-01-16 20:41:20 +090010818/*[clinic input]
10819str.center as unicode_center
10820
10821 width: Py_ssize_t
10822 fillchar: Py_UCS4 = ' '
10823 /
10824
10825Return a centered string of length width.
10826
10827Padding is done using the specified fill character (default is a space).
10828[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
10830static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010831unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10832/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010834 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
Benjamin Petersonbac79492012-01-14 13:34:47 -050010836 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 return NULL;
10838
Victor Stinnerc4b49542011-12-11 22:44:26 +010010839 if (PyUnicode_GET_LENGTH(self) >= width)
10840 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
Victor Stinnerc4b49542011-12-11 22:44:26 +010010842 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 left = marg / 2 + (marg & width & 1);
10844
Victor Stinner9310abb2011-10-05 00:59:23 +020010845 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846}
10847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848/* This function assumes that str1 and str2 are readied by the caller. */
10849
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010851unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010852{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010853#define COMPARE(TYPE1, TYPE2) \
10854 do { \
10855 TYPE1* p1 = (TYPE1 *)data1; \
10856 TYPE2* p2 = (TYPE2 *)data2; \
10857 TYPE1* end = p1 + len; \
10858 Py_UCS4 c1, c2; \
10859 for (; p1 != end; p1++, p2++) { \
10860 c1 = *p1; \
10861 c2 = *p2; \
10862 if (c1 != c2) \
10863 return (c1 < c2) ? -1 : 1; \
10864 } \
10865 } \
10866 while (0)
10867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 int kind1, kind2;
10869 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010870 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 kind1 = PyUnicode_KIND(str1);
10873 kind2 = PyUnicode_KIND(str2);
10874 data1 = PyUnicode_DATA(str1);
10875 data2 = PyUnicode_DATA(str2);
10876 len1 = PyUnicode_GET_LENGTH(str1);
10877 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010878 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010879
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010880 switch(kind1) {
10881 case PyUnicode_1BYTE_KIND:
10882 {
10883 switch(kind2) {
10884 case PyUnicode_1BYTE_KIND:
10885 {
10886 int cmp = memcmp(data1, data2, len);
10887 /* normalize result of memcmp() into the range [-1; 1] */
10888 if (cmp < 0)
10889 return -1;
10890 if (cmp > 0)
10891 return 1;
10892 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010893 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010894 case PyUnicode_2BYTE_KIND:
10895 COMPARE(Py_UCS1, Py_UCS2);
10896 break;
10897 case PyUnicode_4BYTE_KIND:
10898 COMPARE(Py_UCS1, Py_UCS4);
10899 break;
10900 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010901 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902 }
10903 break;
10904 }
10905 case PyUnicode_2BYTE_KIND:
10906 {
10907 switch(kind2) {
10908 case PyUnicode_1BYTE_KIND:
10909 COMPARE(Py_UCS2, Py_UCS1);
10910 break;
10911 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010912 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010913 COMPARE(Py_UCS2, Py_UCS2);
10914 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010915 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010916 case PyUnicode_4BYTE_KIND:
10917 COMPARE(Py_UCS2, Py_UCS4);
10918 break;
10919 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010920 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010921 }
10922 break;
10923 }
10924 case PyUnicode_4BYTE_KIND:
10925 {
10926 switch(kind2) {
10927 case PyUnicode_1BYTE_KIND:
10928 COMPARE(Py_UCS4, Py_UCS1);
10929 break;
10930 case PyUnicode_2BYTE_KIND:
10931 COMPARE(Py_UCS4, Py_UCS2);
10932 break;
10933 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010934 {
10935#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10936 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10937 /* normalize result of wmemcmp() into the range [-1; 1] */
10938 if (cmp < 0)
10939 return -1;
10940 if (cmp > 0)
10941 return 1;
10942#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010943 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010944#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010945 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010946 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010947 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010948 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010949 }
10950 break;
10951 }
10952 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010953 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010954 }
10955
Victor Stinner770e19e2012-10-04 22:59:45 +020010956 if (len1 == len2)
10957 return 0;
10958 if (len1 < len2)
10959 return -1;
10960 else
10961 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010962
10963#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010964}
10965
Benjamin Peterson621b4302016-09-09 13:54:34 -070010966static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010967unicode_compare_eq(PyObject *str1, PyObject *str2)
10968{
10969 int kind;
10970 void *data1, *data2;
10971 Py_ssize_t len;
10972 int cmp;
10973
Victor Stinnere5567ad2012-10-23 02:48:49 +020010974 len = PyUnicode_GET_LENGTH(str1);
10975 if (PyUnicode_GET_LENGTH(str2) != len)
10976 return 0;
10977 kind = PyUnicode_KIND(str1);
10978 if (PyUnicode_KIND(str2) != kind)
10979 return 0;
10980 data1 = PyUnicode_DATA(str1);
10981 data2 = PyUnicode_DATA(str2);
10982
10983 cmp = memcmp(data1, data2, len * kind);
10984 return (cmp == 0);
10985}
10986
10987
Alexander Belopolsky40018472011-02-26 01:02:56 +000010988int
10989PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10992 if (PyUnicode_READY(left) == -1 ||
10993 PyUnicode_READY(right) == -1)
10994 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010995
10996 /* a string is equal to itself */
10997 if (left == right)
10998 return 0;
10999
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011000 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011002 PyErr_Format(PyExc_TypeError,
11003 "Can't compare %.100s and %.100s",
11004 left->ob_type->tp_name,
11005 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 return -1;
11007}
11008
Martin v. Löwis5b222132007-06-10 09:51:05 +000011009int
11010PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 Py_ssize_t i;
11013 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011015 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016
Victor Stinner910337b2011-10-03 03:20:16 +020011017 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011018 if (!PyUnicode_IS_READY(uni)) {
11019 const wchar_t *ws = _PyUnicode_WSTR(uni);
11020 /* Compare Unicode string and source character set string */
11021 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11022 if (chr != ustr[i])
11023 return (chr < ustr[i]) ? -1 : 1;
11024 }
11025 /* This check keeps Python strings that end in '\0' from comparing equal
11026 to C strings identical up to that point. */
11027 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11028 return 1; /* uni is longer */
11029 if (ustr[i])
11030 return -1; /* str is longer */
11031 return 0;
11032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011034 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011035 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011036 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011037 size_t len, len2 = strlen(str);
11038 int cmp;
11039
11040 len = Py_MIN(len1, len2);
11041 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011042 if (cmp != 0) {
11043 if (cmp < 0)
11044 return -1;
11045 else
11046 return 1;
11047 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011048 if (len1 > len2)
11049 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011050 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011051 return -1; /* str is longer */
11052 return 0;
11053 }
11054 else {
11055 void *data = PyUnicode_DATA(uni);
11056 /* Compare Unicode string and source character set string */
11057 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011058 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011059 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11060 /* This check keeps Python strings that end in '\0' from comparing equal
11061 to C strings identical up to that point. */
11062 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11063 return 1; /* uni is longer */
11064 if (str[i])
11065 return -1; /* str is longer */
11066 return 0;
11067 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011068}
11069
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011070static int
11071non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11072{
11073 size_t i, len;
11074 const wchar_t *p;
11075 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11076 if (strlen(str) != len)
11077 return 0;
11078 p = _PyUnicode_WSTR(unicode);
11079 assert(p);
11080 for (i = 0; i < len; i++) {
11081 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011082 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011083 return 0;
11084 }
11085 return 1;
11086}
11087
11088int
11089_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11090{
11091 size_t len;
11092 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011093 assert(str);
11094#ifndef NDEBUG
11095 for (const char *p = str; *p; p++) {
11096 assert((unsigned char)*p < 128);
11097 }
11098#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011099 if (PyUnicode_READY(unicode) == -1) {
11100 /* Memory error or bad data */
11101 PyErr_Clear();
11102 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11103 }
11104 if (!PyUnicode_IS_ASCII(unicode))
11105 return 0;
11106 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11107 return strlen(str) == len &&
11108 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11109}
11110
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011111int
11112_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11113{
11114 PyObject *right_uni;
11115 Py_hash_t hash;
11116
11117 assert(_PyUnicode_CHECK(left));
11118 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011119#ifndef NDEBUG
11120 for (const char *p = right->string; *p; p++) {
11121 assert((unsigned char)*p < 128);
11122 }
11123#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011124
11125 if (PyUnicode_READY(left) == -1) {
11126 /* memory error or bad data */
11127 PyErr_Clear();
11128 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11129 }
11130
11131 if (!PyUnicode_IS_ASCII(left))
11132 return 0;
11133
11134 right_uni = _PyUnicode_FromId(right); /* borrowed */
11135 if (right_uni == NULL) {
11136 /* memory error or bad data */
11137 PyErr_Clear();
11138 return _PyUnicode_EqualToASCIIString(left, right->string);
11139 }
11140
11141 if (left == right_uni)
11142 return 1;
11143
11144 if (PyUnicode_CHECK_INTERNED(left))
11145 return 0;
11146
11147 assert(_PyUnicode_HASH(right_uni) != 1);
11148 hash = _PyUnicode_HASH(left);
11149 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11150 return 0;
11151
11152 return unicode_compare_eq(left, right_uni);
11153}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011154
/* Map a C truth value to the Py_True / Py_False object (no reference is
   taken; the caller is responsible for Py_INCREF). */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011157
Alexander Belopolsky40018472011-02-26 01:02:56 +000011158PyObject *
11159PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011160{
11161 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011162 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011163
Victor Stinnere5567ad2012-10-23 02:48:49 +020011164 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11165 Py_RETURN_NOTIMPLEMENTED;
11166
11167 if (PyUnicode_READY(left) == -1 ||
11168 PyUnicode_READY(right) == -1)
11169 return NULL;
11170
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011171 if (left == right) {
11172 switch (op) {
11173 case Py_EQ:
11174 case Py_LE:
11175 case Py_GE:
11176 /* a string is equal to itself */
11177 v = Py_True;
11178 break;
11179 case Py_NE:
11180 case Py_LT:
11181 case Py_GT:
11182 v = Py_False;
11183 break;
11184 default:
11185 PyErr_BadArgument();
11186 return NULL;
11187 }
11188 }
11189 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011190 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011191 result ^= (op == Py_NE);
11192 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011193 }
11194 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011195 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011196
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011197 /* Convert the return value to a Boolean */
11198 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011199 case Py_LE:
11200 v = TEST_COND(result <= 0);
11201 break;
11202 case Py_GE:
11203 v = TEST_COND(result >= 0);
11204 break;
11205 case Py_LT:
11206 v = TEST_COND(result == -1);
11207 break;
11208 case Py_GT:
11209 v = TEST_COND(result == 1);
11210 break;
11211 default:
11212 PyErr_BadArgument();
11213 return NULL;
11214 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011215 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011216 Py_INCREF(v);
11217 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011218}
11219
Alexander Belopolsky40018472011-02-26 01:02:56 +000011220int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011221_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11222{
11223 return unicode_eq(aa, bb);
11224}
11225
11226int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011228{
Victor Stinner77282cb2013-04-14 19:22:47 +020011229 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 void *buf1, *buf2;
11231 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011232 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011233
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011234 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 "'in <string>' requires string as left operand, not %.100s",
11237 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011238 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011239 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011241 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 if (ensure_unicode(str) < 0)
11243 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011246 kind2 = PyUnicode_KIND(substr);
11247 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011248 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011250 len2 = PyUnicode_GET_LENGTH(substr);
11251 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011252 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011253 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011254 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011255 if (len2 == 1) {
11256 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11257 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011258 return result;
11259 }
11260 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011261 buf2 = _PyUnicode_AsKind(substr, kind1);
11262 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011263 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265
Victor Stinner77282cb2013-04-14 19:22:47 +020011266 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 case PyUnicode_1BYTE_KIND:
11268 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11269 break;
11270 case PyUnicode_2BYTE_KIND:
11271 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11272 break;
11273 case PyUnicode_4BYTE_KIND:
11274 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11275 break;
11276 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011277 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011279
Victor Stinner77282cb2013-04-14 19:22:47 +020011280 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 PyMem_Free(buf2);
11282
Guido van Rossum403d68b2000-03-13 15:55:09 +000011283 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011284}
11285
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286/* Concat to string or Unicode object giving a new Unicode object. */
11287
Alexander Belopolsky40018472011-02-26 01:02:56 +000011288PyObject *
11289PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011292 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011293 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011295 if (ensure_unicode(left) < 0)
11296 return NULL;
11297
11298 if (!PyUnicode_Check(right)) {
11299 PyErr_Format(PyExc_TypeError,
11300 "can only concatenate str (not \"%.200s\") to str",
11301 right->ob_type->tp_name);
11302 return NULL;
11303 }
11304 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
11307 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 if (left == unicode_empty)
11309 return PyUnicode_FromObject(right);
11310 if (right == unicode_empty)
11311 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011313 left_len = PyUnicode_GET_LENGTH(left);
11314 right_len = PyUnicode_GET_LENGTH(right);
11315 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011316 PyErr_SetString(PyExc_OverflowError,
11317 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011318 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011319 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011320 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011321
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011322 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11323 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011324 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011327 result = PyUnicode_New(new_len, maxchar);
11328 if (result == NULL)
11329 return NULL;
11330 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11331 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11332 assert(_PyUnicode_CheckConsistency(result, 1));
11333 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334}
11335
Walter Dörwald1ab83302007-05-18 17:15:44 +000011336void
Victor Stinner23e56682011-10-03 03:54:37 +020011337PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011338{
Victor Stinner23e56682011-10-03 03:54:37 +020011339 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011340 Py_UCS4 maxchar, maxchar2;
11341 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011342
11343 if (p_left == NULL) {
11344 if (!PyErr_Occurred())
11345 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011346 return;
11347 }
Victor Stinner23e56682011-10-03 03:54:37 +020011348 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011349 if (right == NULL || left == NULL
11350 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011351 if (!PyErr_Occurred())
11352 PyErr_BadInternalCall();
11353 goto error;
11354 }
11355
Benjamin Petersonbac79492012-01-14 13:34:47 -050011356 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011357 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011358 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011359 goto error;
11360
Victor Stinner488fa492011-12-12 00:01:39 +010011361 /* Shortcuts */
11362 if (left == unicode_empty) {
11363 Py_DECREF(left);
11364 Py_INCREF(right);
11365 *p_left = right;
11366 return;
11367 }
11368 if (right == unicode_empty)
11369 return;
11370
11371 left_len = PyUnicode_GET_LENGTH(left);
11372 right_len = PyUnicode_GET_LENGTH(right);
11373 if (left_len > PY_SSIZE_T_MAX - right_len) {
11374 PyErr_SetString(PyExc_OverflowError,
11375 "strings are too large to concat");
11376 goto error;
11377 }
11378 new_len = left_len + right_len;
11379
11380 if (unicode_modifiable(left)
11381 && PyUnicode_CheckExact(right)
11382 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011383 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11384 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011385 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011386 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011387 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11388 {
11389 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011390 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011391 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011392
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011393 /* copy 'right' into the newly allocated area of 'left' */
11394 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011395 }
Victor Stinner488fa492011-12-12 00:01:39 +010011396 else {
11397 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11398 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011399 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011400
Victor Stinner488fa492011-12-12 00:01:39 +010011401 /* Concat the two Unicode strings */
11402 res = PyUnicode_New(new_len, maxchar);
11403 if (res == NULL)
11404 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011405 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11406 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011407 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011408 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011409 }
11410 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011411 return;
11412
11413error:
Victor Stinner488fa492011-12-12 00:01:39 +010011414 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011415}
11416
11417void
11418PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11419{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011420 PyUnicode_Append(pleft, right);
11421 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011422}
11423
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011424/*
11425Wraps stringlib_parse_args_finds() and additionally ensures that the
11426first argument is a unicode object.
11427*/
11428
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011429static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011430parse_args_finds_unicode(const char * function_name, PyObject *args,
11431 PyObject **substring,
11432 Py_ssize_t *start, Py_ssize_t *end)
11433{
11434 if(stringlib_parse_args_finds(function_name, args, substring,
11435 start, end)) {
11436 if (ensure_unicode(*substring) < 0)
11437 return 0;
11438 return 1;
11439 }
11440 return 0;
11441}
11442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011446Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011447string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011448interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
11450static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011451unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011453 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011454 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011455 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011457 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 void *buf1, *buf2;
11459 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011461 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 kind1 = PyUnicode_KIND(self);
11465 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011466 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011467 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 len1 = PyUnicode_GET_LENGTH(self);
11470 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011472 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011473 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011474
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011475 buf1 = PyUnicode_DATA(self);
11476 buf2 = PyUnicode_DATA(substring);
11477 if (kind2 != kind1) {
11478 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011479 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011480 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011481 }
11482 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 case PyUnicode_1BYTE_KIND:
11484 iresult = ucs1lib_count(
11485 ((Py_UCS1*)buf1) + start, end - start,
11486 buf2, len2, PY_SSIZE_T_MAX
11487 );
11488 break;
11489 case PyUnicode_2BYTE_KIND:
11490 iresult = ucs2lib_count(
11491 ((Py_UCS2*)buf1) + start, end - start,
11492 buf2, len2, PY_SSIZE_T_MAX
11493 );
11494 break;
11495 case PyUnicode_4BYTE_KIND:
11496 iresult = ucs4lib_count(
11497 ((Py_UCS4*)buf1) + start, end - start,
11498 buf2, len2, PY_SSIZE_T_MAX
11499 );
11500 break;
11501 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011502 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 }
11504
11505 result = PyLong_FromSsize_t(iresult);
11506
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011507 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 return result;
11511}
11512
INADA Naoki3ae20562017-01-16 20:41:20 +090011513/*[clinic input]
11514str.encode as unicode_encode
11515
11516 encoding: str(c_default="NULL") = 'utf-8'
11517 The encoding in which to encode the string.
11518 errors: str(c_default="NULL") = 'strict'
11519 The error handling scheme to use for encoding errors.
11520 The default is 'strict' meaning that encoding errors raise a
11521 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11522 'xmlcharrefreplace' as well as any other name registered with
11523 codecs.register_error that can handle UnicodeEncodeErrors.
11524
11525Encode the string using the codec registered for encoding.
11526[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
11528static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011529unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011530/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011532 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011533}
11534
INADA Naoki3ae20562017-01-16 20:41:20 +090011535/*[clinic input]
11536str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
INADA Naoki3ae20562017-01-16 20:41:20 +090011538 tabsize: int = 8
11539
11540Return a copy where all tab characters are expanded using spaces.
11541
11542If tabsize is not given, a tab size of 8 characters is assumed.
11543[clinic start generated code]*/
11544
11545static PyObject *
11546unicode_expandtabs_impl(PyObject *self, int tabsize)
11547/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011549 Py_ssize_t i, j, line_pos, src_len, incr;
11550 Py_UCS4 ch;
11551 PyObject *u;
11552 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011554 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
Antoine Pitrou22425222011-10-04 19:10:51 +020011556 if (PyUnicode_READY(self) == -1)
11557 return NULL;
11558
Thomas Wouters7e474022000-07-16 12:04:32 +000011559 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 src_len = PyUnicode_GET_LENGTH(self);
11561 i = j = line_pos = 0;
11562 kind = PyUnicode_KIND(self);
11563 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011564 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 for (; i < src_len; i++) {
11566 ch = PyUnicode_READ(kind, src_data, i);
11567 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011568 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011570 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 goto overflow;
11573 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011575 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011579 goto overflow;
11580 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 if (ch == '\n' || ch == '\r')
11583 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011585 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011586 if (!found)
11587 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011588
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011590 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 if (!u)
11592 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011593 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Antoine Pitroue71d5742011-10-04 15:55:09 +020011595 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 for (; i < src_len; i++) {
11598 ch = PyUnicode_READ(kind, src_data, i);
11599 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011601 incr = tabsize - (line_pos % tabsize);
11602 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011603 FILL(kind, dest_data, ' ', j, incr);
11604 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011606 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011608 line_pos++;
11609 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011610 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011611 if (ch == '\n' || ch == '\r')
11612 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011614 }
11615 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011616 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011617
Antoine Pitroue71d5742011-10-04 15:55:09 +020011618 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011619 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
11626Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011627such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628arguments start and end are interpreted as in slice notation.\n\
11629\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011630Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
11632static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011635 /* initialize variables to prevent gcc warning */
11636 PyObject *substring = NULL;
11637 Py_ssize_t start = 0;
11638 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011639 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011641 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011644 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011647 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (result == -2)
11650 return NULL;
11651
Christian Heimes217cfd12007-12-02 14:31:20 +000011652 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653}
11654
11655static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011656unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011658 void *data;
11659 enum PyUnicode_Kind kind;
11660 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011661
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011662 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011663 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011665 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011666 if (PyUnicode_READY(self) == -1) {
11667 return NULL;
11668 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011669 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11670 PyErr_SetString(PyExc_IndexError, "string index out of range");
11671 return NULL;
11672 }
11673 kind = PyUnicode_KIND(self);
11674 data = PyUnicode_DATA(self);
11675 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011676 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677}
11678
Guido van Rossumc2504932007-09-18 19:42:40 +000011679/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011680 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011681static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011682unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683{
Guido van Rossumc2504932007-09-18 19:42:40 +000011684 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011685 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011686
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011687#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011688 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011689#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (_PyUnicode_HASH(self) != -1)
11691 return _PyUnicode_HASH(self);
11692 if (PyUnicode_READY(self) == -1)
11693 return -1;
11694 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011695 /*
11696 We make the hash of the empty string be 0, rather than using
11697 (prefix ^ suffix), since this slightly obfuscates the hash secret
11698 */
11699 if (len == 0) {
11700 _PyUnicode_HASH(self) = 0;
11701 return 0;
11702 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011703 x = _Py_HashBytes(PyUnicode_DATA(self),
11704 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011706 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707}
11708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011709PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011712Return the lowest index in S where substring sub is found, \n\
11713such that sub is contained within S[start:end]. Optional\n\
11714arguments start and end are interpreted as in slice notation.\n\
11715\n\
11716Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
11718static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011721 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011722 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011723 PyObject *substring = NULL;
11724 Py_ssize_t start = 0;
11725 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011727 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011730 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011733 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 if (result == -2)
11736 return NULL;
11737
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 if (result < 0) {
11739 PyErr_SetString(PyExc_ValueError, "substring not found");
11740 return NULL;
11741 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011742
Christian Heimes217cfd12007-12-02 14:31:20 +000011743 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744}
11745
INADA Naoki3ae20562017-01-16 20:41:20 +090011746/*[clinic input]
11747str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
INADA Naoki3ae20562017-01-16 20:41:20 +090011749Return True if the string is a lowercase string, False otherwise.
11750
11751A string is lowercase if all cased characters in the string are lowercase and
11752there is at least one cased character in the string.
11753[clinic start generated code]*/
11754
11755static PyObject *
11756unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011757/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 Py_ssize_t i, length;
11760 int kind;
11761 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 int cased;
11763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (PyUnicode_READY(self) == -1)
11765 return NULL;
11766 length = PyUnicode_GET_LENGTH(self);
11767 kind = PyUnicode_KIND(self);
11768 data = PyUnicode_DATA(self);
11769
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 if (length == 1)
11772 return PyBool_FromLong(
11773 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011775 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011777 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011778
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 for (i = 0; i < length; i++) {
11781 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011782
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011784 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 else if (!cased && Py_UNICODE_ISLOWER(ch))
11786 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011788 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789}
11790
INADA Naoki3ae20562017-01-16 20:41:20 +090011791/*[clinic input]
11792str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
INADA Naoki3ae20562017-01-16 20:41:20 +090011794Return True if the string is an uppercase string, False otherwise.
11795
11796A string is uppercase if all cased characters in the string are uppercase and
11797there is at least one cased character in the string.
11798[clinic start generated code]*/
11799
11800static PyObject *
11801unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011802/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 Py_ssize_t i, length;
11805 int kind;
11806 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 int cased;
11808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 if (PyUnicode_READY(self) == -1)
11810 return NULL;
11811 length = PyUnicode_GET_LENGTH(self);
11812 kind = PyUnicode_KIND(self);
11813 data = PyUnicode_DATA(self);
11814
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (length == 1)
11817 return PyBool_FromLong(
11818 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011820 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011822 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011823
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 for (i = 0; i < length; i++) {
11826 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011827
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011829 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 else if (!cased && Py_UNICODE_ISUPPER(ch))
11831 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011833 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834}
11835
INADA Naoki3ae20562017-01-16 20:41:20 +090011836/*[clinic input]
11837str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
INADA Naoki3ae20562017-01-16 20:41:20 +090011839Return True if the string is a title-cased string, False otherwise.
11840
11841In a title-cased string, upper- and title-case characters may only
11842follow uncased characters and lowercase characters only cased ones.
11843[clinic start generated code]*/
11844
11845static PyObject *
11846unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011847/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 Py_ssize_t i, length;
11850 int kind;
11851 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 int cased, previous_is_cased;
11853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (PyUnicode_READY(self) == -1)
11855 return NULL;
11856 length = PyUnicode_GET_LENGTH(self);
11857 kind = PyUnicode_KIND(self);
11858 data = PyUnicode_DATA(self);
11859
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 if (length == 1) {
11862 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11863 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11864 (Py_UNICODE_ISUPPER(ch) != 0));
11865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011867 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011869 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011870
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 cased = 0;
11872 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 for (i = 0; i < length; i++) {
11874 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011875
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11877 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011878 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 previous_is_cased = 1;
11880 cased = 1;
11881 }
11882 else if (Py_UNICODE_ISLOWER(ch)) {
11883 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011884 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011885 previous_is_cased = 1;
11886 cased = 1;
11887 }
11888 else
11889 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011891 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892}
11893
INADA Naoki3ae20562017-01-16 20:41:20 +090011894/*[clinic input]
11895str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
INADA Naoki3ae20562017-01-16 20:41:20 +090011897Return True if the string is a whitespace string, False otherwise.
11898
11899A string is whitespace if all characters in the string are whitespace and there
11900is at least one character in the string.
11901[clinic start generated code]*/
11902
11903static PyObject *
11904unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011905/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 Py_ssize_t i, length;
11908 int kind;
11909 void *data;
11910
11911 if (PyUnicode_READY(self) == -1)
11912 return NULL;
11913 length = PyUnicode_GET_LENGTH(self);
11914 kind = PyUnicode_KIND(self);
11915 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 if (length == 1)
11919 return PyBool_FromLong(
11920 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011922 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011924 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 for (i = 0; i < length; i++) {
11927 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011928 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011929 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011931 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932}
11933
INADA Naoki3ae20562017-01-16 20:41:20 +090011934/*[clinic input]
11935str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011936
INADA Naoki3ae20562017-01-16 20:41:20 +090011937Return True if the string is an alphabetic string, False otherwise.
11938
11939A string is alphabetic if all characters in the string are alphabetic and there
11940is at least one character in the string.
11941[clinic start generated code]*/
11942
11943static PyObject *
11944unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011945/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 Py_ssize_t i, length;
11948 int kind;
11949 void *data;
11950
11951 if (PyUnicode_READY(self) == -1)
11952 return NULL;
11953 length = PyUnicode_GET_LENGTH(self);
11954 kind = PyUnicode_KIND(self);
11955 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011956
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011957 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if (length == 1)
11959 return PyBool_FromLong(
11960 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961
11962 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011964 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 for (i = 0; i < length; i++) {
11967 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011968 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011969 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011970 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011971}
11972
INADA Naoki3ae20562017-01-16 20:41:20 +090011973/*[clinic input]
11974str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011975
INADA Naoki3ae20562017-01-16 20:41:20 +090011976Return True if the string is an alpha-numeric string, False otherwise.
11977
11978A string is alpha-numeric if all characters in the string are alpha-numeric and
11979there is at least one character in the string.
11980[clinic start generated code]*/
11981
11982static PyObject *
11983unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011984/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 int kind;
11987 void *data;
11988 Py_ssize_t len, i;
11989
11990 if (PyUnicode_READY(self) == -1)
11991 return NULL;
11992
11993 kind = PyUnicode_KIND(self);
11994 data = PyUnicode_DATA(self);
11995 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011996
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011997 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 if (len == 1) {
11999 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12000 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12001 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002
12003 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012005 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 for (i = 0; i < len; i++) {
12008 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012009 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012010 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012011 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012012 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012013}
12014
INADA Naoki3ae20562017-01-16 20:41:20 +090012015/*[clinic input]
12016str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017
INADA Naoki3ae20562017-01-16 20:41:20 +090012018Return True if the string is a decimal string, False otherwise.
12019
12020A string is a decimal string if all characters in the string are decimal and
12021there is at least one character in the string.
12022[clinic start generated code]*/
12023
12024static PyObject *
12025unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012026/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 Py_ssize_t i, length;
12029 int kind;
12030 void *data;
12031
12032 if (PyUnicode_READY(self) == -1)
12033 return NULL;
12034 length = PyUnicode_GET_LENGTH(self);
12035 kind = PyUnicode_KIND(self);
12036 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if (length == 1)
12040 return PyBool_FromLong(
12041 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012043 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012045 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 for (i = 0; i < length; i++) {
12048 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012049 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012051 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052}
12053
INADA Naoki3ae20562017-01-16 20:41:20 +090012054/*[clinic input]
12055str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
INADA Naoki3ae20562017-01-16 20:41:20 +090012057Return True if the string is a digit string, False otherwise.
12058
12059A string is a digit string if all characters in the string are digits and there
12060is at least one character in the string.
12061[clinic start generated code]*/
12062
12063static PyObject *
12064unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012065/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 Py_ssize_t i, length;
12068 int kind;
12069 void *data;
12070
12071 if (PyUnicode_READY(self) == -1)
12072 return NULL;
12073 length = PyUnicode_GET_LENGTH(self);
12074 kind = PyUnicode_KIND(self);
12075 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if (length == 1) {
12079 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12080 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012083 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012085 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 for (i = 0; i < length; i++) {
12088 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012089 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012091 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092}
12093
INADA Naoki3ae20562017-01-16 20:41:20 +090012094/*[clinic input]
12095str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096
INADA Naoki3ae20562017-01-16 20:41:20 +090012097Return True if the string is a numeric string, False otherwise.
12098
12099A string is numeric if all characters in the string are numeric and there is at
12100least one character in the string.
12101[clinic start generated code]*/
12102
12103static PyObject *
12104unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012105/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 Py_ssize_t i, length;
12108 int kind;
12109 void *data;
12110
12111 if (PyUnicode_READY(self) == -1)
12112 return NULL;
12113 length = PyUnicode_GET_LENGTH(self);
12114 kind = PyUnicode_KIND(self);
12115 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 if (length == 1)
12119 return PyBool_FromLong(
12120 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012122 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012124 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 for (i = 0; i < length; i++) {
12127 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012128 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012130 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131}
12132
Martin v. Löwis47383402007-08-15 07:32:56 +000012133int
12134PyUnicode_IsIdentifier(PyObject *self)
12135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 int kind;
12137 void *data;
12138 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012139 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (PyUnicode_READY(self) == -1) {
12142 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 }
12145
12146 /* Special case for empty strings */
12147 if (PyUnicode_GET_LENGTH(self) == 0)
12148 return 0;
12149 kind = PyUnicode_KIND(self);
12150 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012151
12152 /* PEP 3131 says that the first character must be in
12153 XID_Start and subsequent characters in XID_Continue,
12154 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012155 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012156 letters, digits, underscore). However, given the current
12157 definition of XID_Start and XID_Continue, it is sufficient
12158 to check just for these, except that _ must be allowed
12159 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012161 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012162 return 0;
12163
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012164 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012167 return 1;
12168}
12169
INADA Naoki3ae20562017-01-16 20:41:20 +090012170/*[clinic input]
12171str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012172
INADA Naoki3ae20562017-01-16 20:41:20 +090012173Return True if the string is a valid Python identifier, False otherwise.
12174
12175Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12176"class".
12177[clinic start generated code]*/
12178
12179static PyObject *
12180unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012181/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012182{
12183 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12184}
12185
INADA Naoki3ae20562017-01-16 20:41:20 +090012186/*[clinic input]
12187str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012188
INADA Naoki3ae20562017-01-16 20:41:20 +090012189Return True if the string is printable, False otherwise.
12190
12191A string is printable if all of its characters are considered printable in
12192repr() or if it is empty.
12193[clinic start generated code]*/
12194
12195static PyObject *
12196unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012197/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012198{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 Py_ssize_t i, length;
12200 int kind;
12201 void *data;
12202
12203 if (PyUnicode_READY(self) == -1)
12204 return NULL;
12205 length = PyUnicode_GET_LENGTH(self);
12206 kind = PyUnicode_KIND(self);
12207 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012208
12209 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (length == 1)
12211 return PyBool_FromLong(
12212 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 for (i = 0; i < length; i++) {
12215 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012216 Py_RETURN_FALSE;
12217 }
12218 }
12219 Py_RETURN_TRUE;
12220}
12221
INADA Naoki3ae20562017-01-16 20:41:20 +090012222/*[clinic input]
12223str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
INADA Naoki3ae20562017-01-16 20:41:20 +090012225 iterable: object
12226 /
12227
12228Concatenate any number of strings.
12229
Martin Panter91a88662017-01-24 00:30:06 +000012230The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012231The result is returned as a new string.
12232
12233Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12234[clinic start generated code]*/
12235
12236static PyObject *
12237unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012238/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239{
INADA Naoki3ae20562017-01-16 20:41:20 +090012240 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241}
12242
Martin v. Löwis18e16552006-02-15 17:27:45 +000012243static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012244unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 if (PyUnicode_READY(self) == -1)
12247 return -1;
12248 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249}
12250
INADA Naoki3ae20562017-01-16 20:41:20 +090012251/*[clinic input]
12252str.ljust as unicode_ljust
12253
12254 width: Py_ssize_t
12255 fillchar: Py_UCS4 = ' '
12256 /
12257
12258Return a left-justified string of length width.
12259
12260Padding is done using the specified fill character (default is a space).
12261[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
12263static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012264unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12265/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012267 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
Victor Stinnerc4b49542011-12-11 22:44:26 +010012270 if (PyUnicode_GET_LENGTH(self) >= width)
12271 return unicode_result_unchanged(self);
12272
12273 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274}
12275
INADA Naoki3ae20562017-01-16 20:41:20 +090012276/*[clinic input]
12277str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
INADA Naoki3ae20562017-01-16 20:41:20 +090012279Return a copy of the string converted to lowercase.
12280[clinic start generated code]*/
12281
12282static PyObject *
12283unicode_lower_impl(PyObject *self)
12284/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012286 if (PyUnicode_READY(self) == -1)
12287 return NULL;
12288 if (PyUnicode_IS_ASCII(self))
12289 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012290 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291}
12292
/* Strip modes.  Each value doubles as an index into stripfuncnames[]. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Method-name strings, indexed by the strip-mode constants above. */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip"
};

/* Look up the method-name string for strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012301
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302/* externally visible for str.strip(unicode) */
12303PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012304_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 void *data;
12307 int kind;
12308 Py_ssize_t i, j, len;
12309 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012310 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12313 return NULL;
12314
12315 kind = PyUnicode_KIND(self);
12316 data = PyUnicode_DATA(self);
12317 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012318 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12320 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012321 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012322
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 i = 0;
12324 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012325 while (i < len) {
12326 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12327 if (!BLOOM(sepmask, ch))
12328 break;
12329 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12330 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 i++;
12332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 j = len;
12336 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012337 j--;
12338 while (j >= i) {
12339 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12340 if (!BLOOM(sepmask, ch))
12341 break;
12342 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12343 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012345 }
12346
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012348 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012349
Victor Stinner7931d9a2011-11-04 00:22:48 +010012350 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351}
12352
12353PyObject*
12354PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12355{
12356 unsigned char *data;
12357 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012358 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359
Victor Stinnerde636f32011-10-01 03:55:54 +020012360 if (PyUnicode_READY(self) == -1)
12361 return NULL;
12362
Victor Stinner684d5fd2012-05-03 02:32:34 +020012363 length = PyUnicode_GET_LENGTH(self);
12364 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012365
Victor Stinner684d5fd2012-05-03 02:32:34 +020012366 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012367 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368
Victor Stinnerde636f32011-10-01 03:55:54 +020012369 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012370 PyErr_SetString(PyExc_IndexError, "string index out of range");
12371 return NULL;
12372 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012373 if (start >= length || end < start)
12374 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012375
Victor Stinner684d5fd2012-05-03 02:32:34 +020012376 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012377 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012378 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012379 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012380 }
12381 else {
12382 kind = PyUnicode_KIND(self);
12383 data = PyUnicode_1BYTE_DATA(self);
12384 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012385 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012386 length);
12387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389
12390static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012391do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 Py_ssize_t len, i, j;
12394
12395 if (PyUnicode_READY(self) == -1)
12396 return NULL;
12397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012399
Victor Stinnercc7af722013-04-09 22:39:24 +020012400 if (PyUnicode_IS_ASCII(self)) {
12401 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12402
12403 i = 0;
12404 if (striptype != RIGHTSTRIP) {
12405 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012406 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012407 if (!_Py_ascii_whitespace[ch])
12408 break;
12409 i++;
12410 }
12411 }
12412
12413 j = len;
12414 if (striptype != LEFTSTRIP) {
12415 j--;
12416 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012417 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012418 if (!_Py_ascii_whitespace[ch])
12419 break;
12420 j--;
12421 }
12422 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 }
12424 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012425 else {
12426 int kind = PyUnicode_KIND(self);
12427 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012428
Victor Stinnercc7af722013-04-09 22:39:24 +020012429 i = 0;
12430 if (striptype != RIGHTSTRIP) {
12431 while (i < len) {
12432 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12433 if (!Py_UNICODE_ISSPACE(ch))
12434 break;
12435 i++;
12436 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012437 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012438
12439 j = len;
12440 if (striptype != LEFTSTRIP) {
12441 j--;
12442 while (j >= i) {
12443 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12444 if (!Py_UNICODE_ISSPACE(ch))
12445 break;
12446 j--;
12447 }
12448 j++;
12449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012451
Victor Stinner7931d9a2011-11-04 00:22:48 +010012452 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453}
12454
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012455
12456static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012457do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 if (sep != NULL && sep != Py_None) {
12460 if (PyUnicode_Check(sep))
12461 return _PyUnicode_XStrip(self, striptype, sep);
12462 else {
12463 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 "%s arg must be None or str",
12465 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466 return NULL;
12467 }
12468 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012469
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012471}
12472
12473
INADA Naoki3ae20562017-01-16 20:41:20 +090012474/*[clinic input]
12475str.strip as unicode_strip
12476
12477 chars: object = None
12478 /
12479
Victor Stinner0c4a8282017-01-17 02:21:47 +010012480Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012481
12482If chars is given and not None, remove characters in chars instead.
12483[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012484
12485static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012486unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012487/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012488{
INADA Naoki3ae20562017-01-16 20:41:20 +090012489 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012490}
12491
12492
INADA Naoki3ae20562017-01-16 20:41:20 +090012493/*[clinic input]
12494str.lstrip as unicode_lstrip
12495
12496 chars: object = NULL
12497 /
12498
12499Return a copy of the string with leading whitespace removed.
12500
12501If chars is given and not None, remove characters in chars instead.
12502[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012503
12504static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012505unicode_lstrip_impl(PyObject *self, PyObject *chars)
12506/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012507{
INADA Naoki3ae20562017-01-16 20:41:20 +090012508 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012509}
12510
12511
INADA Naoki3ae20562017-01-16 20:41:20 +090012512/*[clinic input]
12513str.rstrip as unicode_rstrip
12514
12515 chars: object = NULL
12516 /
12517
12518Return a copy of the string with trailing whitespace removed.
12519
12520If chars is given and not None, remove characters in chars instead.
12521[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012522
12523static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012524unicode_rstrip_impl(PyObject *self, PyObject *chars)
12525/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012526{
INADA Naoki3ae20562017-01-16 20:41:20 +090012527 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012528}
12529
12530
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012532unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012534 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
Serhiy Storchaka05997252013-01-26 12:14:02 +020012537 if (len < 1)
12538 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
Victor Stinnerc4b49542011-12-11 22:44:26 +010012540 /* no repeat, return original string */
12541 if (len == 1)
12542 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012543
Benjamin Petersonbac79492012-01-14 13:34:47 -050012544 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 return NULL;
12546
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012547 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012548 PyErr_SetString(PyExc_OverflowError,
12549 "repeated string is too long");
12550 return NULL;
12551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012553
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012554 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555 if (!u)
12556 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012557 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 if (PyUnicode_GET_LENGTH(str) == 1) {
12560 const int kind = PyUnicode_KIND(str);
12561 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012562 if (kind == PyUnicode_1BYTE_KIND) {
12563 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012564 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012565 }
12566 else if (kind == PyUnicode_2BYTE_KIND) {
12567 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012568 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012569 ucs2[n] = fill_char;
12570 } else {
12571 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12572 assert(kind == PyUnicode_4BYTE_KIND);
12573 for (n = 0; n < len; ++n)
12574 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 }
12577 else {
12578 /* number of characters copied this far */
12579 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012580 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012582 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012586 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012587 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589 }
12590
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012591 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012592 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593}
12594
Alexander Belopolsky40018472011-02-26 01:02:56 +000012595PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012596PyUnicode_Replace(PyObject *str,
12597 PyObject *substr,
12598 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012599 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012601 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12602 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012604 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605}
12606
INADA Naoki3ae20562017-01-16 20:41:20 +090012607/*[clinic input]
12608str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609
INADA Naoki3ae20562017-01-16 20:41:20 +090012610 old: unicode
12611 new: unicode
12612 count: Py_ssize_t = -1
12613 Maximum number of occurrences to replace.
12614 -1 (the default value) means replace all occurrences.
12615 /
12616
12617Return a copy with all occurrences of substring old replaced by new.
12618
12619If the optional argument count is given, only the first count occurrences are
12620replaced.
12621[clinic start generated code]*/
12622
12623static PyObject *
12624unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12625 Py_ssize_t count)
12626/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012628 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012629 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012630 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631}
12632
Alexander Belopolsky40018472011-02-26 01:02:56 +000012633static PyObject *
12634unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012636 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 Py_ssize_t isize;
12638 Py_ssize_t osize, squote, dquote, i, o;
12639 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012640 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012644 return NULL;
12645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 isize = PyUnicode_GET_LENGTH(unicode);
12647 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 /* Compute length of output, quote characters, and
12650 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012651 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 max = 127;
12653 squote = dquote = 0;
12654 ikind = PyUnicode_KIND(unicode);
12655 for (i = 0; i < isize; i++) {
12656 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012657 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012659 case '\'': squote++; break;
12660 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012662 incr = 2;
12663 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 default:
12665 /* Fast-path ASCII */
12666 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012669 ;
12670 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012673 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012675 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012677 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012679 if (osize > PY_SSIZE_T_MAX - incr) {
12680 PyErr_SetString(PyExc_OverflowError,
12681 "string is too long to generate repr");
12682 return NULL;
12683 }
12684 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 }
12686
12687 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012688 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012690 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 if (dquote)
12692 /* Both squote and dquote present. Use squote,
12693 and escape them */
12694 osize += squote;
12695 else
12696 quote = '"';
12697 }
Victor Stinner55c08782013-04-14 18:45:39 +020012698 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699
12700 repr = PyUnicode_New(osize, max);
12701 if (repr == NULL)
12702 return NULL;
12703 okind = PyUnicode_KIND(repr);
12704 odata = PyUnicode_DATA(repr);
12705
12706 PyUnicode_WRITE(okind, odata, 0, quote);
12707 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012708 if (unchanged) {
12709 _PyUnicode_FastCopyCharacters(repr, 1,
12710 unicode, 0,
12711 isize);
12712 }
12713 else {
12714 for (i = 0, o = 1; i < isize; i++) {
12715 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716
Victor Stinner55c08782013-04-14 18:45:39 +020012717 /* Escape quotes and backslashes */
12718 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012719 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012721 continue;
12722 }
12723
12724 /* Map special whitespace to '\t', \n', '\r' */
12725 if (ch == '\t') {
12726 PyUnicode_WRITE(okind, odata, o++, '\\');
12727 PyUnicode_WRITE(okind, odata, o++, 't');
12728 }
12729 else if (ch == '\n') {
12730 PyUnicode_WRITE(okind, odata, o++, '\\');
12731 PyUnicode_WRITE(okind, odata, o++, 'n');
12732 }
12733 else if (ch == '\r') {
12734 PyUnicode_WRITE(okind, odata, o++, '\\');
12735 PyUnicode_WRITE(okind, odata, o++, 'r');
12736 }
12737
12738 /* Map non-printable US ASCII to '\xhh' */
12739 else if (ch < ' ' || ch == 0x7F) {
12740 PyUnicode_WRITE(okind, odata, o++, '\\');
12741 PyUnicode_WRITE(okind, odata, o++, 'x');
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12744 }
12745
12746 /* Copy ASCII characters as-is */
12747 else if (ch < 0x7F) {
12748 PyUnicode_WRITE(okind, odata, o++, ch);
12749 }
12750
12751 /* Non-ASCII characters */
12752 else {
12753 /* Map Unicode whitespace and control characters
12754 (categories Z* and C* except ASCII space)
12755 */
12756 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12757 PyUnicode_WRITE(okind, odata, o++, '\\');
12758 /* Map 8-bit characters to '\xhh' */
12759 if (ch <= 0xff) {
12760 PyUnicode_WRITE(okind, odata, o++, 'x');
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12763 }
12764 /* Map 16-bit characters to '\uxxxx' */
12765 else if (ch <= 0xffff) {
12766 PyUnicode_WRITE(okind, odata, o++, 'u');
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12771 }
12772 /* Map 21-bit characters to '\U00xxxxxx' */
12773 else {
12774 PyUnicode_WRITE(okind, odata, o++, 'U');
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12777 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12778 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12783 }
12784 }
12785 /* Copy characters as-is */
12786 else {
12787 PyUnicode_WRITE(okind, odata, o++, ch);
12788 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012789 }
12790 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012793 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012794 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795}
12796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012797PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799\n\
12800Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012801such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802arguments start and end are interpreted as in slice notation.\n\
12803\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012804Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
12806static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012809 /* initialize variables to prevent gcc warning */
12810 PyObject *substring = NULL;
12811 Py_ssize_t start = 0;
12812 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012815 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012818 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012821 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 if (result == -2)
12824 return NULL;
12825
Christian Heimes217cfd12007-12-02 14:31:20 +000012826 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827}
12828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012829PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012832Return the highest index in S where substring sub is found,\n\
12833such that sub is contained within S[start:end]. Optional\n\
12834arguments start and end are interpreted as in slice notation.\n\
12835\n\
12836Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
12838static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012841 /* initialize variables to prevent gcc warning */
12842 PyObject *substring = NULL;
12843 Py_ssize_t start = 0;
12844 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012845 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012847 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012850 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012853 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 if (result == -2)
12856 return NULL;
12857
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858 if (result < 0) {
12859 PyErr_SetString(PyExc_ValueError, "substring not found");
12860 return NULL;
12861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012862
Christian Heimes217cfd12007-12-02 14:31:20 +000012863 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864}
12865
INADA Naoki3ae20562017-01-16 20:41:20 +090012866/*[clinic input]
12867str.rjust as unicode_rjust
12868
12869 width: Py_ssize_t
12870 fillchar: Py_UCS4 = ' '
12871 /
12872
12873Return a right-justified string of length width.
12874
12875Padding is done using the specified fill character (default is a space).
12876[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877
12878static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012879unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12880/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012882 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883 return NULL;
12884
Victor Stinnerc4b49542011-12-11 22:44:26 +010012885 if (PyUnicode_GET_LENGTH(self) >= width)
12886 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
Victor Stinnerc4b49542011-12-11 22:44:26 +010012888 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889}
12890
Alexander Belopolsky40018472011-02-26 01:02:56 +000012891PyObject *
12892PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012894 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012897 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898}
12899
INADA Naoki3ae20562017-01-16 20:41:20 +090012900/*[clinic input]
12901str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902
INADA Naoki3ae20562017-01-16 20:41:20 +090012903 sep: object = None
12904 The delimiter according to which to split the string.
12905 None (the default value) means split according to any whitespace,
12906 and discard empty strings from the result.
12907 maxsplit: Py_ssize_t = -1
12908 Maximum number of splits to do.
12909 -1 (the default value) means no limit.
12910
12911Return a list of the words in the string, using sep as the delimiter string.
12912[clinic start generated code]*/
12913
12914static PyObject *
12915unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12916/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917{
INADA Naoki3ae20562017-01-16 20:41:20 +090012918 if (sep == Py_None)
12919 return split(self, NULL, maxsplit);
12920 if (PyUnicode_Check(sep))
12921 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012922
12923 PyErr_Format(PyExc_TypeError,
12924 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012925 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927}
12928
Thomas Wouters477c8d52006-05-27 19:21:47 +000012929PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012930PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012931{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012932 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012933 int kind1, kind2;
12934 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012936
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012937 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012939
Victor Stinner14f8f022011-10-05 20:58:25 +020012940 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 len1 = PyUnicode_GET_LENGTH(str_obj);
12943 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012944 if (kind1 < kind2 || len1 < len2) {
12945 _Py_INCREF_UNICODE_EMPTY();
12946 if (!unicode_empty)
12947 out = NULL;
12948 else {
12949 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12950 Py_DECREF(unicode_empty);
12951 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012952 return out;
12953 }
12954 buf1 = PyUnicode_DATA(str_obj);
12955 buf2 = PyUnicode_DATA(sep_obj);
12956 if (kind2 != kind1) {
12957 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12958 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012959 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012962 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012964 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12965 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12966 else
12967 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 break;
12969 case PyUnicode_2BYTE_KIND:
12970 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971 break;
12972 case PyUnicode_4BYTE_KIND:
12973 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12974 break;
12975 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012976 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012979 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981
12982 return out;
12983}
12984
12985
12986PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012987PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012990 int kind1, kind2;
12991 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012994 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012997 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 len1 = PyUnicode_GET_LENGTH(str_obj);
13000 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013001 if (kind1 < kind2 || len1 < len2) {
13002 _Py_INCREF_UNICODE_EMPTY();
13003 if (!unicode_empty)
13004 out = NULL;
13005 else {
13006 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13007 Py_DECREF(unicode_empty);
13008 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013009 return out;
13010 }
13011 buf1 = PyUnicode_DATA(str_obj);
13012 buf2 = PyUnicode_DATA(sep_obj);
13013 if (kind2 != kind1) {
13014 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13015 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013016 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013019 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013021 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13022 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13023 else
13024 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 break;
13026 case PyUnicode_2BYTE_KIND:
13027 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13028 break;
13029 case PyUnicode_4BYTE_KIND:
13030 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13031 break;
13032 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013033 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013036 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013038
13039 return out;
13040}
13041
INADA Naoki3ae20562017-01-16 20:41:20 +090013042/*[clinic input]
13043str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044
INADA Naoki3ae20562017-01-16 20:41:20 +090013045 sep: object
13046 /
13047
13048Partition the string into three parts using the given separator.
13049
13050This will search for the separator in the string. If the separator is found,
13051returns a 3-tuple containing the part before the separator, the separator
13052itself, and the part after it.
13053
13054If the separator is not found, returns a 3-tuple containing the original string
13055and two empty strings.
13056[clinic start generated code]*/
13057
13058static PyObject *
13059unicode_partition(PyObject *self, PyObject *sep)
13060/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013061{
INADA Naoki3ae20562017-01-16 20:41:20 +090013062 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063}
13064
INADA Naoki3ae20562017-01-16 20:41:20 +090013065/*[clinic input]
13066str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013067
INADA Naoki3ae20562017-01-16 20:41:20 +090013068Partition the string into three parts using the given separator.
13069
This will search for the separator in the string, starting at the end. If
13071the separator is found, returns a 3-tuple containing the part before the
13072separator, the separator itself, and the part after it.
13073
13074If the separator is not found, returns a 3-tuple containing two empty strings
13075and the original string.
13076[clinic start generated code]*/
13077
13078static PyObject *
13079unicode_rpartition(PyObject *self, PyObject *sep)
13080/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013081{
INADA Naoki3ae20562017-01-16 20:41:20 +090013082 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013083}
13084
Alexander Belopolsky40018472011-02-26 01:02:56 +000013085PyObject *
13086PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013087{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013088 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013089 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013090
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013091 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013092}
13093
INADA Naoki3ae20562017-01-16 20:41:20 +090013094/*[clinic input]
13095str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013096
INADA Naoki3ae20562017-01-16 20:41:20 +090013097Return a list of the words in the string, using sep as the delimiter string.
13098
13099Splits are done starting at the end of the string and working to the front.
13100[clinic start generated code]*/
13101
13102static PyObject *
13103unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13104/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013105{
INADA Naoki3ae20562017-01-16 20:41:20 +090013106 if (sep == Py_None)
13107 return rsplit(self, NULL, maxsplit);
13108 if (PyUnicode_Check(sep))
13109 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013110
13111 PyErr_Format(PyExc_TypeError,
13112 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013113 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013114 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013115}
13116
INADA Naoki3ae20562017-01-16 20:41:20 +090013117/*[clinic input]
13118str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013120 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013121
13122Return a list of the lines in the string, breaking at line boundaries.
13123
13124Line breaks are not included in the resulting list unless keepends is given and
13125true.
13126[clinic start generated code]*/
13127
13128static PyObject *
13129unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013130/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013132 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133}
13134
13135static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013136PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013138 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139}
13140
INADA Naoki3ae20562017-01-16 20:41:20 +090013141/*[clinic input]
13142str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143
INADA Naoki3ae20562017-01-16 20:41:20 +090013144Convert uppercase characters to lowercase and lowercase characters to uppercase.
13145[clinic start generated code]*/
13146
13147static PyObject *
13148unicode_swapcase_impl(PyObject *self)
13149/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013151 if (PyUnicode_READY(self) == -1)
13152 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013153 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154}
13155
Larry Hastings61272b72014-01-07 12:41:53 -080013156/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013157
Larry Hastings31826802013-10-19 00:09:25 -070013158@staticmethod
13159str.maketrans as unicode_maketrans
13160
13161 x: object
13162
13163 y: unicode=NULL
13164
13165 z: unicode=NULL
13166
13167 /
13168
13169Return a translation table usable for str.translate().
13170
13171If there is only one argument, it must be a dictionary mapping Unicode
13172ordinals (integers) or characters to Unicode ordinals, strings or None.
13173Character keys will be then converted to ordinals.
13174If there are two arguments, they must be strings of equal length, and
13175in the resulting dictionary, each character in x will be mapped to the
13176character at the same position in y. If there is a third argument, it
13177must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013178[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013179
Larry Hastings31826802013-10-19 00:09:25 -070013180static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013181unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013182/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013183{
Georg Brandlceee0772007-11-27 23:48:05 +000013184 PyObject *new = NULL, *key, *value;
13185 Py_ssize_t i = 0;
13186 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013187
Georg Brandlceee0772007-11-27 23:48:05 +000013188 new = PyDict_New();
13189 if (!new)
13190 return NULL;
13191 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 int x_kind, y_kind, z_kind;
13193 void *x_data, *y_data, *z_data;
13194
Georg Brandlceee0772007-11-27 23:48:05 +000013195 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013196 if (!PyUnicode_Check(x)) {
13197 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13198 "be a string if there is a second argument");
13199 goto err;
13200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013202 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13203 "arguments must have equal length");
13204 goto err;
13205 }
13206 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 x_kind = PyUnicode_KIND(x);
13208 y_kind = PyUnicode_KIND(y);
13209 x_data = PyUnicode_DATA(x);
13210 y_data = PyUnicode_DATA(y);
13211 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13212 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013213 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013214 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013215 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013216 if (!value) {
13217 Py_DECREF(key);
13218 goto err;
13219 }
Georg Brandlceee0772007-11-27 23:48:05 +000013220 res = PyDict_SetItem(new, key, value);
13221 Py_DECREF(key);
13222 Py_DECREF(value);
13223 if (res < 0)
13224 goto err;
13225 }
13226 /* create entries for deleting chars in z */
13227 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 z_kind = PyUnicode_KIND(z);
13229 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013230 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013232 if (!key)
13233 goto err;
13234 res = PyDict_SetItem(new, key, Py_None);
13235 Py_DECREF(key);
13236 if (res < 0)
13237 goto err;
13238 }
13239 }
13240 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 int kind;
13242 void *data;
13243
Georg Brandlceee0772007-11-27 23:48:05 +000013244 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013245 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013246 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13247 "to maketrans it must be a dict");
13248 goto err;
13249 }
13250 /* copy entries into the new dict, converting string keys to int keys */
13251 while (PyDict_Next(x, &i, &key, &value)) {
13252 if (PyUnicode_Check(key)) {
13253 /* convert string keys to integer keys */
13254 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013255 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013256 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13257 "table must be of length 1");
13258 goto err;
13259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 kind = PyUnicode_KIND(key);
13261 data = PyUnicode_DATA(key);
13262 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013263 if (!newkey)
13264 goto err;
13265 res = PyDict_SetItem(new, newkey, value);
13266 Py_DECREF(newkey);
13267 if (res < 0)
13268 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013269 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013270 /* just keep integer keys */
13271 if (PyDict_SetItem(new, key, value) < 0)
13272 goto err;
13273 } else {
13274 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13275 "be strings or integers");
13276 goto err;
13277 }
13278 }
13279 }
13280 return new;
13281 err:
13282 Py_DECREF(new);
13283 return NULL;
13284}
13285
INADA Naoki3ae20562017-01-16 20:41:20 +090013286/*[clinic input]
13287str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288
INADA Naoki3ae20562017-01-16 20:41:20 +090013289 table: object
13290 Translation table, which must be a mapping of Unicode ordinals to
13291 Unicode ordinals, strings, or None.
13292 /
13293
13294Replace each character in the string using the given translation table.
13295
13296The table must implement lookup/indexing via __getitem__, for instance a
13297dictionary or list. If this operation raises LookupError, the character is
13298left untouched. Characters mapped to None are deleted.
13299[clinic start generated code]*/
13300
13301static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013303/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306}
13307
INADA Naoki3ae20562017-01-16 20:41:20 +090013308/*[clinic input]
13309str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310
INADA Naoki3ae20562017-01-16 20:41:20 +090013311Return a copy of the string converted to uppercase.
13312[clinic start generated code]*/
13313
13314static PyObject *
13315unicode_upper_impl(PyObject *self)
13316/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013318 if (PyUnicode_READY(self) == -1)
13319 return NULL;
13320 if (PyUnicode_IS_ASCII(self))
13321 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013322 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323}
13324
INADA Naoki3ae20562017-01-16 20:41:20 +090013325/*[clinic input]
13326str.zfill as unicode_zfill
13327
13328 width: Py_ssize_t
13329 /
13330
13331Pad a numeric string with zeros on the left, to fill a field of the given width.
13332
13333The string is never truncated.
13334[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335
13336static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013337unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013338/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013340 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013341 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 int kind;
13343 void *data;
13344 Py_UCS4 chr;
13345
Benjamin Petersonbac79492012-01-14 13:34:47 -050013346 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013347 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348
Victor Stinnerc4b49542011-12-11 22:44:26 +010013349 if (PyUnicode_GET_LENGTH(self) >= width)
13350 return unicode_result_unchanged(self);
13351
13352 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353
13354 u = pad(self, fill, 0, '0');
13355
Walter Dörwald068325e2002-04-15 13:36:47 +000013356 if (u == NULL)
13357 return NULL;
13358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013359 kind = PyUnicode_KIND(u);
13360 data = PyUnicode_DATA(u);
13361 chr = PyUnicode_READ(kind, data, fill);
13362
13363 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 PyUnicode_WRITE(kind, data, 0, chr);
13366 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367 }
13368
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013369 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013370 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
#if 0
/* Compiled out: wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013381PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013384Return True if S starts with the specified prefix, False otherwise.\n\
13385With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386With optional end, stop comparing S at that position.\n\
13387prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388
13389static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013390unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013393 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013394 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013395 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013396 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398
Jesus Ceaac451502011-04-20 17:09:23 +020013399 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 if (PyTuple_Check(subobj)) {
13402 Py_ssize_t i;
13403 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013404 substring = PyTuple_GET_ITEM(subobj, i);
13405 if (!PyUnicode_Check(substring)) {
13406 PyErr_Format(PyExc_TypeError,
13407 "tuple for startswith must only contain str, "
13408 "not %.100s",
13409 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013410 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013411 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013413 if (result == -1)
13414 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013415 if (result) {
13416 Py_RETURN_TRUE;
13417 }
13418 }
13419 /* nothing matched */
13420 Py_RETURN_FALSE;
13421 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013422 if (!PyUnicode_Check(subobj)) {
13423 PyErr_Format(PyExc_TypeError,
13424 "startswith first arg must be str or "
13425 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013427 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013428 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013429 if (result == -1)
13430 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013431 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432}
13433
13434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013435PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013438Return True if S ends with the specified suffix, False otherwise.\n\
13439With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013440With optional end, stop comparing S at that position.\n\
13441suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442
13443static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013444unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013447 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013448 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013449 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013450 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013451 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013452
Jesus Ceaac451502011-04-20 17:09:23 +020013453 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013455 if (PyTuple_Check(subobj)) {
13456 Py_ssize_t i;
13457 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013458 substring = PyTuple_GET_ITEM(subobj, i);
13459 if (!PyUnicode_Check(substring)) {
13460 PyErr_Format(PyExc_TypeError,
13461 "tuple for endswith must only contain str, "
13462 "not %.100s",
13463 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013465 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013466 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013467 if (result == -1)
13468 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013469 if (result) {
13470 Py_RETURN_TRUE;
13471 }
13472 }
13473 Py_RETURN_FALSE;
13474 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013475 if (!PyUnicode_Check(subobj)) {
13476 PyErr_Format(PyExc_TypeError,
13477 "endswith first arg must be str or "
13478 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013480 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013481 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013482 if (result == -1)
13483 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013484 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013485}
13486
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013487static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013488_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013489{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013490 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13491 writer->data = PyUnicode_DATA(writer->buffer);
13492
13493 if (!writer->readonly) {
13494 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013495 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013496 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013497 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013498 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13499 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13500 writer->kind = PyUnicode_WCHAR_KIND;
13501 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13502
Victor Stinner8f674cc2013-04-17 23:02:17 +020013503 /* Copy-on-write mode: set buffer size to 0 so
13504 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13505 * next write. */
13506 writer->size = 0;
13507 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013508}
13509
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013511_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013512{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013513 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013514
13515 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013517
13518 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13519 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13520 writer->kind = PyUnicode_WCHAR_KIND;
13521 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013522}
13523
Victor Stinnerd3f08822012-05-29 12:57:52 +020013524int
13525_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13526 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013527{
13528 Py_ssize_t newlen;
13529 PyObject *newbuffer;
13530
Victor Stinner2740e462016-09-06 16:58:36 -070013531 assert(maxchar <= MAX_UNICODE);
13532
Victor Stinnerca9381e2015-09-22 00:58:32 +020013533 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013534 assert((maxchar > writer->maxchar && length >= 0)
13535 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536
Victor Stinner202fdca2012-05-07 12:47:02 +020013537 if (length > PY_SSIZE_T_MAX - writer->pos) {
13538 PyErr_NoMemory();
13539 return -1;
13540 }
13541 newlen = writer->pos + length;
13542
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013543 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013544
Victor Stinnerd3f08822012-05-29 12:57:52 +020013545 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013546 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013547 if (writer->overallocate
13548 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13549 /* overallocate to limit the number of realloc() */
13550 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013551 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013552 if (newlen < writer->min_length)
13553 newlen = writer->min_length;
13554
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 writer->buffer = PyUnicode_New(newlen, maxchar);
13556 if (writer->buffer == NULL)
13557 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013559 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013560 if (writer->overallocate
13561 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13562 /* overallocate to limit the number of realloc() */
13563 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013565 if (newlen < writer->min_length)
13566 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013568 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013569 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013570 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013571 newbuffer = PyUnicode_New(newlen, maxchar);
13572 if (newbuffer == NULL)
13573 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013574 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13575 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013576 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013577 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013578 }
13579 else {
13580 newbuffer = resize_compact(writer->buffer, newlen);
13581 if (newbuffer == NULL)
13582 return -1;
13583 }
13584 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013585 }
13586 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013587 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013588 newbuffer = PyUnicode_New(writer->size, maxchar);
13589 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013590 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013591 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13592 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013593 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013594 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013595 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013596 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013597
13598#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013599}
13600
Victor Stinnerca9381e2015-09-22 00:58:32 +020013601int
13602_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13603 enum PyUnicode_Kind kind)
13604{
13605 Py_UCS4 maxchar;
13606
13607 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13608 assert(writer->kind < kind);
13609
13610 switch (kind)
13611 {
13612 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13613 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13614 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13615 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013616 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013617 }
13618
13619 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13620}
13621
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013622static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013623_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013624{
Victor Stinner2740e462016-09-06 16:58:36 -070013625 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013626 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13627 return -1;
13628 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13629 writer->pos++;
13630 return 0;
13631}
13632
13633int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013634_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13635{
13636 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13637}
13638
13639int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13641{
13642 Py_UCS4 maxchar;
13643 Py_ssize_t len;
13644
13645 if (PyUnicode_READY(str) == -1)
13646 return -1;
13647 len = PyUnicode_GET_LENGTH(str);
13648 if (len == 0)
13649 return 0;
13650 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13651 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013652 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013653 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013654 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013655 Py_INCREF(str);
13656 writer->buffer = str;
13657 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013658 writer->pos += len;
13659 return 0;
13660 }
13661 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13662 return -1;
13663 }
13664 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13665 str, 0, len);
13666 writer->pos += len;
13667 return 0;
13668}
13669
Victor Stinnere215d962012-10-06 23:03:36 +020013670int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013671_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13672 Py_ssize_t start, Py_ssize_t end)
13673{
13674 Py_UCS4 maxchar;
13675 Py_ssize_t len;
13676
13677 if (PyUnicode_READY(str) == -1)
13678 return -1;
13679
13680 assert(0 <= start);
13681 assert(end <= PyUnicode_GET_LENGTH(str));
13682 assert(start <= end);
13683
13684 if (end == 0)
13685 return 0;
13686
13687 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13688 return _PyUnicodeWriter_WriteStr(writer, str);
13689
13690 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13691 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13692 else
13693 maxchar = writer->maxchar;
13694 len = end - start;
13695
13696 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13697 return -1;
13698
13699 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13700 str, start, len);
13701 writer->pos += len;
13702 return 0;
13703}
13704
13705int
Victor Stinner4a587072013-11-19 12:54:53 +010013706_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13707 const char *ascii, Py_ssize_t len)
13708{
13709 if (len == -1)
13710 len = strlen(ascii);
13711
13712 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13713
13714 if (writer->buffer == NULL && !writer->overallocate) {
13715 PyObject *str;
13716
13717 str = _PyUnicode_FromASCII(ascii, len);
13718 if (str == NULL)
13719 return -1;
13720
13721 writer->readonly = 1;
13722 writer->buffer = str;
13723 _PyUnicodeWriter_Update(writer);
13724 writer->pos += len;
13725 return 0;
13726 }
13727
13728 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13729 return -1;
13730
13731 switch (writer->kind)
13732 {
13733 case PyUnicode_1BYTE_KIND:
13734 {
13735 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13736 Py_UCS1 *data = writer->data;
13737
Christian Heimesf051e432016-09-13 20:22:02 +020013738 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013739 break;
13740 }
13741 case PyUnicode_2BYTE_KIND:
13742 {
13743 _PyUnicode_CONVERT_BYTES(
13744 Py_UCS1, Py_UCS2,
13745 ascii, ascii + len,
13746 (Py_UCS2 *)writer->data + writer->pos);
13747 break;
13748 }
13749 case PyUnicode_4BYTE_KIND:
13750 {
13751 _PyUnicode_CONVERT_BYTES(
13752 Py_UCS1, Py_UCS4,
13753 ascii, ascii + len,
13754 (Py_UCS4 *)writer->data + writer->pos);
13755 break;
13756 }
13757 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013758 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013759 }
13760
13761 writer->pos += len;
13762 return 0;
13763}
13764
13765int
13766_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13767 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013768{
13769 Py_UCS4 maxchar;
13770
13771 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13772 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13773 return -1;
13774 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13775 writer->pos += len;
13776 return 0;
13777}
13778
Victor Stinnerd3f08822012-05-29 12:57:52 +020013779PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013780_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013781{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013782 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013783
Victor Stinnerd3f08822012-05-29 12:57:52 +020013784 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013785 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013786 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013787 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013788
13789 str = writer->buffer;
13790 writer->buffer = NULL;
13791
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013792 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013793 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13794 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013795 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013796
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013797 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13798 PyObject *str2;
13799 str2 = resize_compact(str, writer->pos);
13800 if (str2 == NULL) {
13801 Py_DECREF(str);
13802 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013803 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013804 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013805 }
13806
Victor Stinner15a0bd32013-07-08 22:29:55 +020013807 assert(_PyUnicode_CheckConsistency(str, 1));
13808 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013809}
13810
Victor Stinnerd3f08822012-05-29 12:57:52 +020013811void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013812_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013813{
13814 Py_CLEAR(writer->buffer);
13815}
13816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013817#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013818
13819PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013821\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013822Return a formatted version of S, using substitutions from args and kwargs.\n\
13823The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013824
Eric Smith27bbca62010-11-04 17:06:58 +000013825PyDoc_STRVAR(format_map__doc__,
13826 "S.format_map(mapping) -> str\n\
13827\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013828Return a formatted version of S, using substitutions from mapping.\n\
13829The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013830
INADA Naoki3ae20562017-01-16 20:41:20 +090013831/*[clinic input]
13832str.__format__ as unicode___format__
13833
13834 format_spec: unicode
13835 /
13836
13837Return a formatted version of the string as described by format_spec.
13838[clinic start generated code]*/
13839
Eric Smith4a7d76d2008-05-30 18:10:19 +000013840static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013841unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013842/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013843{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844 _PyUnicodeWriter writer;
13845 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013846
Victor Stinnerd3f08822012-05-29 12:57:52 +020013847 if (PyUnicode_READY(self) == -1)
13848 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013849 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013850 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13851 self, format_spec, 0,
13852 PyUnicode_GET_LENGTH(format_spec));
13853 if (ret == -1) {
13854 _PyUnicodeWriter_Dealloc(&writer);
13855 return NULL;
13856 }
13857 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013858}
13859
INADA Naoki3ae20562017-01-16 20:41:20 +090013860/*[clinic input]
13861str.__sizeof__ as unicode_sizeof
13862
13863Return the size of the string in memory, in bytes.
13864[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013865
13866static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013867unicode_sizeof_impl(PyObject *self)
13868/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013870 Py_ssize_t size;
13871
13872 /* If it's a compact object, account for base structure +
13873 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013874 if (PyUnicode_IS_COMPACT_ASCII(self))
13875 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13876 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013878 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013879 else {
13880 /* If it is a two-block object, account for base object, and
13881 for character block if present. */
13882 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013883 if (_PyUnicode_DATA_ANY(self))
13884 size += (PyUnicode_GET_LENGTH(self) + 1) *
13885 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886 }
13887 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013888 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013889 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13890 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13891 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13892 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013893
13894 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013895}
13896
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013897static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013898unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013899{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013900 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013901 if (!copy)
13902 return NULL;
13903 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013904}
13905
Guido van Rossumd57fd912000-03-10 22:53:23 +000013906static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013907 UNICODE_ENCODE_METHODDEF
13908 UNICODE_REPLACE_METHODDEF
13909 UNICODE_SPLIT_METHODDEF
13910 UNICODE_RSPLIT_METHODDEF
13911 UNICODE_JOIN_METHODDEF
13912 UNICODE_CAPITALIZE_METHODDEF
13913 UNICODE_CASEFOLD_METHODDEF
13914 UNICODE_TITLE_METHODDEF
13915 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013916 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013917 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013918 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013919 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013920 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013921 UNICODE_LJUST_METHODDEF
13922 UNICODE_LOWER_METHODDEF
13923 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013924 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13925 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013926 UNICODE_RJUST_METHODDEF
13927 UNICODE_RSTRIP_METHODDEF
13928 UNICODE_RPARTITION_METHODDEF
13929 UNICODE_SPLITLINES_METHODDEF
13930 UNICODE_STRIP_METHODDEF
13931 UNICODE_SWAPCASE_METHODDEF
13932 UNICODE_TRANSLATE_METHODDEF
13933 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013934 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13935 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013936 UNICODE_ISLOWER_METHODDEF
13937 UNICODE_ISUPPER_METHODDEF
13938 UNICODE_ISTITLE_METHODDEF
13939 UNICODE_ISSPACE_METHODDEF
13940 UNICODE_ISDECIMAL_METHODDEF
13941 UNICODE_ISDIGIT_METHODDEF
13942 UNICODE_ISNUMERIC_METHODDEF
13943 UNICODE_ISALPHA_METHODDEF
13944 UNICODE_ISALNUM_METHODDEF
13945 UNICODE_ISIDENTIFIER_METHODDEF
13946 UNICODE_ISPRINTABLE_METHODDEF
13947 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013948 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013949 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013950 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013951 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013952 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013953#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013954 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013955 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956#endif
13957
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013959 {NULL, NULL}
13960};
13961
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013962static PyObject *
13963unicode_mod(PyObject *v, PyObject *w)
13964{
Brian Curtindfc80e32011-08-10 20:28:54 -050013965 if (!PyUnicode_Check(v))
13966 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013967 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013968}
13969
13970static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 0, /*nb_add*/
13972 0, /*nb_subtract*/
13973 0, /*nb_multiply*/
13974 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013975};
13976
Guido van Rossumd57fd912000-03-10 22:53:23 +000013977static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 (lenfunc) unicode_length, /* sq_length */
13979 PyUnicode_Concat, /* sq_concat */
13980 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13981 (ssizeargfunc) unicode_getitem, /* sq_item */
13982 0, /* sq_slice */
13983 0, /* sq_ass_item */
13984 0, /* sq_ass_slice */
13985 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013986};
13987
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013988static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013989unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013991 if (PyUnicode_READY(self) == -1)
13992 return NULL;
13993
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013994 if (PyIndex_Check(item)) {
13995 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996 if (i == -1 && PyErr_Occurred())
13997 return NULL;
13998 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013999 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014000 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014001 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000014002 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014003 PyObject *result;
14004 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014005 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014006 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014007
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014008 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014009 return NULL;
14010 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014011 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14012 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014013
14014 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014015 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014016 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014017 slicelength == PyUnicode_GET_LENGTH(self)) {
14018 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014019 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014020 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014021 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014022 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014023 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014024 src_kind = PyUnicode_KIND(self);
14025 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014026 if (!PyUnicode_IS_ASCII(self)) {
14027 kind_limit = kind_maxchar_limit(src_kind);
14028 max_char = 0;
14029 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14030 ch = PyUnicode_READ(src_kind, src_data, cur);
14031 if (ch > max_char) {
14032 max_char = ch;
14033 if (max_char >= kind_limit)
14034 break;
14035 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014036 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014037 }
Victor Stinner55c99112011-10-13 01:17:06 +020014038 else
14039 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014040 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014041 if (result == NULL)
14042 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014043 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014044 dest_data = PyUnicode_DATA(result);
14045
14046 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014047 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14048 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014049 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014050 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014051 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014052 } else {
14053 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14054 return NULL;
14055 }
14056}
14057
14058static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 (lenfunc)unicode_length, /* mp_length */
14060 (binaryfunc)unicode_subscript, /* mp_subscript */
14061 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014062};
14063
Guido van Rossumd57fd912000-03-10 22:53:23 +000014064
Guido van Rossumd57fd912000-03-10 22:53:23 +000014065/* Helpers for PyUnicode_Format() */
14066
Victor Stinnera47082312012-10-04 02:19:54 +020014067struct unicode_formatter_t {
14068 PyObject *args;
14069 int args_owned;
14070 Py_ssize_t arglen, argidx;
14071 PyObject *dict;
14072
14073 enum PyUnicode_Kind fmtkind;
14074 Py_ssize_t fmtcnt, fmtpos;
14075 void *fmtdata;
14076 PyObject *fmtstr;
14077
14078 _PyUnicodeWriter writer;
14079};
14080
14081struct unicode_format_arg_t {
14082 Py_UCS4 ch;
14083 int flags;
14084 Py_ssize_t width;
14085 int prec;
14086 int sign;
14087};
14088
Guido van Rossumd57fd912000-03-10 22:53:23 +000014089static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014090unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014091{
Victor Stinnera47082312012-10-04 02:19:54 +020014092 Py_ssize_t argidx = ctx->argidx;
14093
14094 if (argidx < ctx->arglen) {
14095 ctx->argidx++;
14096 if (ctx->arglen < 0)
14097 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014098 else
Victor Stinnera47082312012-10-04 02:19:54 +020014099 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100 }
14101 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014102 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014103 return NULL;
14104}
14105
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014106/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014107
Victor Stinnera47082312012-10-04 02:19:54 +020014108/* Format a float into the writer if the writer is not NULL, or into *p_output
14109 otherwise.
14110
14111 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014112static int
Victor Stinnera47082312012-10-04 02:19:54 +020014113formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14114 PyObject **p_output,
14115 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014116{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014117 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014119 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014120 int prec;
14121 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014122
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123 x = PyFloat_AsDouble(v);
14124 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014125 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014126
Victor Stinnera47082312012-10-04 02:19:54 +020014127 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014128 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014129 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014130
Victor Stinnera47082312012-10-04 02:19:54 +020014131 if (arg->flags & F_ALT)
14132 dtoa_flags = Py_DTSF_ALT;
14133 else
14134 dtoa_flags = 0;
14135 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014136 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014137 return -1;
14138 len = strlen(p);
14139 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014140 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014141 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014142 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014143 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014144 }
14145 else
14146 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014147 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014148 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014149}
14150
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 * Return value:  a new str object, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base prefix is present only for o, x and X conversions, and only
 *         when alt is nonzero.  The case of hex digits will be correct,
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted (must be an int; asserted)
 * alt          nonzero to keep the base prefix (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 *           values; the sign precedes the base prefix.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;  /* 1 if '-', else 0 */
    int len;   /* number of characters */
    Py_ssize_t llen;
    int numdigits; /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX */
    /* (up to three non-digit characters -- sign and a two-character
       prefix -- are added to prec below, in int arithmetic) */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Convert in the requested base.  For 'o', 'x' and 'X' the text
       starts with a two-character base prefix, counted in numnondigits. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below uses int; refuse anything longer. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; for a negative number this drops the sign
           together with half of the prefix, so the sign is written back at
           the new start of the text (overwriting the rest of the prefix). */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a scratch bytes object: non-digits,
           then the zero fill, then the digits.  From here on result is
           that bytes object, not a str. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* Produce the final str.  A new object is needed when result is the
       scratch bytes object, or when buf no longer points at the start of
       result's data (the base prefix was skipped). */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        /* On failure result is cleared, so NULL is returned. */
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14296
Ethan Furmandf3ed242014-01-05 06:50:30 -080014297/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014298 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014299 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014300 * -1 and raise an exception on error */
14301static int
Victor Stinnera47082312012-10-04 02:19:54 +020014302mainformatlong(PyObject *v,
14303 struct unicode_format_arg_t *arg,
14304 PyObject **p_output,
14305 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014306{
14307 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014308 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014309
14310 if (!PyNumber_Check(v))
14311 goto wrongtype;
14312
Ethan Furman9ab74802014-03-21 06:38:46 -070014313 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014314 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014315 if (type == 'o' || type == 'x' || type == 'X') {
14316 iobj = PyNumber_Index(v);
14317 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014318 if (PyErr_ExceptionMatches(PyExc_TypeError))
14319 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014320 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014321 }
14322 }
14323 else {
14324 iobj = PyNumber_Long(v);
14325 if (iobj == NULL ) {
14326 if (PyErr_ExceptionMatches(PyExc_TypeError))
14327 goto wrongtype;
14328 return -1;
14329 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014330 }
14331 assert(PyLong_Check(iobj));
14332 }
14333 else {
14334 iobj = v;
14335 Py_INCREF(iobj);
14336 }
14337
14338 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014339 && arg->width == -1 && arg->prec == -1
14340 && !(arg->flags & (F_SIGN | F_BLANK))
14341 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014342 {
14343 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014344 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014345 int base;
14346
Victor Stinnera47082312012-10-04 02:19:54 +020014347 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014348 {
14349 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014350 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014351 case 'd':
14352 case 'i':
14353 case 'u':
14354 base = 10;
14355 break;
14356 case 'o':
14357 base = 8;
14358 break;
14359 case 'x':
14360 case 'X':
14361 base = 16;
14362 break;
14363 }
14364
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014365 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14366 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014367 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014368 }
14369 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014370 return 1;
14371 }
14372
Ethan Furmanb95b5612015-01-23 20:05:18 -080014373 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014374 Py_DECREF(iobj);
14375 if (res == NULL)
14376 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014377 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014378 return 0;
14379
14380wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014381 switch(type)
14382 {
14383 case 'o':
14384 case 'x':
14385 case 'X':
14386 PyErr_Format(PyExc_TypeError,
14387 "%%%c format: an integer is required, "
14388 "not %.200s",
14389 type, Py_TYPE(v)->tp_name);
14390 break;
14391 default:
14392 PyErr_Format(PyExc_TypeError,
14393 "%%%c format: a number is required, "
14394 "not %.200s",
14395 type, Py_TYPE(v)->tp_name);
14396 break;
14397 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014398 return -1;
14399}
14400
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014401static Py_UCS4
14402formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014403{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014404 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014405 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014406 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014407 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014408 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014409 goto onError;
14410 }
14411 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014412 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014413 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014414 /* make sure number is a type of integer */
14415 if (!PyLong_Check(v)) {
14416 iobj = PyNumber_Index(v);
14417 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014418 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014419 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014420 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014421 Py_DECREF(iobj);
14422 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014423 else {
14424 x = PyLong_AsLong(v);
14425 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014426 if (x == -1 && PyErr_Occurred())
14427 goto onError;
14428
Victor Stinner8faf8212011-12-08 22:14:11 +010014429 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014430 PyErr_SetString(PyExc_OverflowError,
14431 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014432 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 }
14434
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014435 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014436 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014437
Benjamin Peterson29060642009-01-31 22:14:21 +000014438 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014439 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014440 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014441 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014442}
14443
Victor Stinnera47082312012-10-04 02:19:54 +020014444/* Parse options of an argument: flags, width, precision.
14445 Handle also "%(name)" syntax.
14446
14447 Return 0 if the argument has been formatted into arg->str.
14448 Return 1 if the argument has been written into ctx->writer,
14449 Raise an exception and return -1 on error. */
14450static int
14451unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14452 struct unicode_format_arg_t *arg)
14453{
14454#define FORMAT_READ(ctx) \
14455 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14456
14457 PyObject *v;
14458
Victor Stinnera47082312012-10-04 02:19:54 +020014459 if (arg->ch == '(') {
14460 /* Get argument value from a dictionary. Example: "%(name)s". */
14461 Py_ssize_t keystart;
14462 Py_ssize_t keylen;
14463 PyObject *key;
14464 int pcount = 1;
14465
14466 if (ctx->dict == NULL) {
14467 PyErr_SetString(PyExc_TypeError,
14468 "format requires a mapping");
14469 return -1;
14470 }
14471 ++ctx->fmtpos;
14472 --ctx->fmtcnt;
14473 keystart = ctx->fmtpos;
14474 /* Skip over balanced parentheses */
14475 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14476 arg->ch = FORMAT_READ(ctx);
14477 if (arg->ch == ')')
14478 --pcount;
14479 else if (arg->ch == '(')
14480 ++pcount;
14481 ctx->fmtpos++;
14482 }
14483 keylen = ctx->fmtpos - keystart - 1;
14484 if (ctx->fmtcnt < 0 || pcount > 0) {
14485 PyErr_SetString(PyExc_ValueError,
14486 "incomplete format key");
14487 return -1;
14488 }
14489 key = PyUnicode_Substring(ctx->fmtstr,
14490 keystart, keystart + keylen);
14491 if (key == NULL)
14492 return -1;
14493 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014494 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014495 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014496 }
14497 ctx->args = PyObject_GetItem(ctx->dict, key);
14498 Py_DECREF(key);
14499 if (ctx->args == NULL)
14500 return -1;
14501 ctx->args_owned = 1;
14502 ctx->arglen = -1;
14503 ctx->argidx = -2;
14504 }
14505
14506 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014507 while (--ctx->fmtcnt >= 0) {
14508 arg->ch = FORMAT_READ(ctx);
14509 ctx->fmtpos++;
14510 switch (arg->ch) {
14511 case '-': arg->flags |= F_LJUST; continue;
14512 case '+': arg->flags |= F_SIGN; continue;
14513 case ' ': arg->flags |= F_BLANK; continue;
14514 case '#': arg->flags |= F_ALT; continue;
14515 case '0': arg->flags |= F_ZERO; continue;
14516 }
14517 break;
14518 }
14519
14520 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014521 if (arg->ch == '*') {
14522 v = unicode_format_getnextarg(ctx);
14523 if (v == NULL)
14524 return -1;
14525 if (!PyLong_Check(v)) {
14526 PyErr_SetString(PyExc_TypeError,
14527 "* wants int");
14528 return -1;
14529 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014530 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014531 if (arg->width == -1 && PyErr_Occurred())
14532 return -1;
14533 if (arg->width < 0) {
14534 arg->flags |= F_LJUST;
14535 arg->width = -arg->width;
14536 }
14537 if (--ctx->fmtcnt >= 0) {
14538 arg->ch = FORMAT_READ(ctx);
14539 ctx->fmtpos++;
14540 }
14541 }
14542 else if (arg->ch >= '0' && arg->ch <= '9') {
14543 arg->width = arg->ch - '0';
14544 while (--ctx->fmtcnt >= 0) {
14545 arg->ch = FORMAT_READ(ctx);
14546 ctx->fmtpos++;
14547 if (arg->ch < '0' || arg->ch > '9')
14548 break;
14549 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14550 mixing signed and unsigned comparison. Since arg->ch is between
14551 '0' and '9', casting to int is safe. */
14552 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14553 PyErr_SetString(PyExc_ValueError,
14554 "width too big");
14555 return -1;
14556 }
14557 arg->width = arg->width*10 + (arg->ch - '0');
14558 }
14559 }
14560
14561 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014562 if (arg->ch == '.') {
14563 arg->prec = 0;
14564 if (--ctx->fmtcnt >= 0) {
14565 arg->ch = FORMAT_READ(ctx);
14566 ctx->fmtpos++;
14567 }
14568 if (arg->ch == '*') {
14569 v = unicode_format_getnextarg(ctx);
14570 if (v == NULL)
14571 return -1;
14572 if (!PyLong_Check(v)) {
14573 PyErr_SetString(PyExc_TypeError,
14574 "* wants int");
14575 return -1;
14576 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014577 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014578 if (arg->prec == -1 && PyErr_Occurred())
14579 return -1;
14580 if (arg->prec < 0)
14581 arg->prec = 0;
14582 if (--ctx->fmtcnt >= 0) {
14583 arg->ch = FORMAT_READ(ctx);
14584 ctx->fmtpos++;
14585 }
14586 }
14587 else if (arg->ch >= '0' && arg->ch <= '9') {
14588 arg->prec = arg->ch - '0';
14589 while (--ctx->fmtcnt >= 0) {
14590 arg->ch = FORMAT_READ(ctx);
14591 ctx->fmtpos++;
14592 if (arg->ch < '0' || arg->ch > '9')
14593 break;
14594 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14595 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014596 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014597 return -1;
14598 }
14599 arg->prec = arg->prec*10 + (arg->ch - '0');
14600 }
14601 }
14602 }
14603
14604 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14605 if (ctx->fmtcnt >= 0) {
14606 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14607 if (--ctx->fmtcnt >= 0) {
14608 arg->ch = FORMAT_READ(ctx);
14609 ctx->fmtpos++;
14610 }
14611 }
14612 }
14613 if (ctx->fmtcnt < 0) {
14614 PyErr_SetString(PyExc_ValueError,
14615 "incomplete format");
14616 return -1;
14617 }
14618 return 0;
14619
14620#undef FORMAT_READ
14621}
14622
14623/* Format one argument. Supported conversion specifiers:
14624
14625 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014626 - "i", "d", "u": int or float
14627 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014628 - "e", "E", "f", "F", "g", "G": float
14629 - "c": int or str (1 character)
14630
Victor Stinner8dbd4212012-12-04 09:30:24 +010014631 When possible, the output is written directly into the Unicode writer
14632 (ctx->writer). A string is created when padding is required.
14633
Victor Stinnera47082312012-10-04 02:19:54 +020014634 Return 0 if the argument has been formatted into *p_str,
14635 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014636 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014637static int
14638unicode_format_arg_format(struct unicode_formatter_t *ctx,
14639 struct unicode_format_arg_t *arg,
14640 PyObject **p_str)
14641{
14642 PyObject *v;
14643 _PyUnicodeWriter *writer = &ctx->writer;
14644
14645 if (ctx->fmtcnt == 0)
14646 ctx->writer.overallocate = 0;
14647
Victor Stinnera47082312012-10-04 02:19:54 +020014648 v = unicode_format_getnextarg(ctx);
14649 if (v == NULL)
14650 return -1;
14651
Victor Stinnera47082312012-10-04 02:19:54 +020014652
14653 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014654 case 's':
14655 case 'r':
14656 case 'a':
14657 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14658 /* Fast path */
14659 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14660 return -1;
14661 return 1;
14662 }
14663
14664 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14665 *p_str = v;
14666 Py_INCREF(*p_str);
14667 }
14668 else {
14669 if (arg->ch == 's')
14670 *p_str = PyObject_Str(v);
14671 else if (arg->ch == 'r')
14672 *p_str = PyObject_Repr(v);
14673 else
14674 *p_str = PyObject_ASCII(v);
14675 }
14676 break;
14677
14678 case 'i':
14679 case 'd':
14680 case 'u':
14681 case 'o':
14682 case 'x':
14683 case 'X':
14684 {
14685 int ret = mainformatlong(v, arg, p_str, writer);
14686 if (ret != 0)
14687 return ret;
14688 arg->sign = 1;
14689 break;
14690 }
14691
14692 case 'e':
14693 case 'E':
14694 case 'f':
14695 case 'F':
14696 case 'g':
14697 case 'G':
14698 if (arg->width == -1 && arg->prec == -1
14699 && !(arg->flags & (F_SIGN | F_BLANK)))
14700 {
14701 /* Fast path */
14702 if (formatfloat(v, arg, NULL, writer) == -1)
14703 return -1;
14704 return 1;
14705 }
14706
14707 arg->sign = 1;
14708 if (formatfloat(v, arg, p_str, NULL) == -1)
14709 return -1;
14710 break;
14711
14712 case 'c':
14713 {
14714 Py_UCS4 ch = formatchar(v);
14715 if (ch == (Py_UCS4) -1)
14716 return -1;
14717 if (arg->width == -1 && arg->prec == -1) {
14718 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014719 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014720 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014721 return 1;
14722 }
14723 *p_str = PyUnicode_FromOrdinal(ch);
14724 break;
14725 }
14726
14727 default:
14728 PyErr_Format(PyExc_ValueError,
14729 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014730 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014731 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14732 (int)arg->ch,
14733 ctx->fmtpos - 1);
14734 return -1;
14735 }
14736 if (*p_str == NULL)
14737 return -1;
14738 assert (PyUnicode_Check(*p_str));
14739 return 0;
14740}
14741
14742static int
14743unicode_format_arg_output(struct unicode_formatter_t *ctx,
14744 struct unicode_format_arg_t *arg,
14745 PyObject *str)
14746{
14747 Py_ssize_t len;
14748 enum PyUnicode_Kind kind;
14749 void *pbuf;
14750 Py_ssize_t pindex;
14751 Py_UCS4 signchar;
14752 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014753 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014754 Py_ssize_t sublen;
14755 _PyUnicodeWriter *writer = &ctx->writer;
14756 Py_UCS4 fill;
14757
14758 fill = ' ';
14759 if (arg->sign && arg->flags & F_ZERO)
14760 fill = '0';
14761
14762 if (PyUnicode_READY(str) == -1)
14763 return -1;
14764
14765 len = PyUnicode_GET_LENGTH(str);
14766 if ((arg->width == -1 || arg->width <= len)
14767 && (arg->prec == -1 || arg->prec >= len)
14768 && !(arg->flags & (F_SIGN | F_BLANK)))
14769 {
14770 /* Fast path */
14771 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14772 return -1;
14773 return 0;
14774 }
14775
14776 /* Truncate the string for "s", "r" and "a" formats
14777 if the precision is set */
14778 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14779 if (arg->prec >= 0 && len > arg->prec)
14780 len = arg->prec;
14781 }
14782
14783 /* Adjust sign and width */
14784 kind = PyUnicode_KIND(str);
14785 pbuf = PyUnicode_DATA(str);
14786 pindex = 0;
14787 signchar = '\0';
14788 if (arg->sign) {
14789 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14790 if (ch == '-' || ch == '+') {
14791 signchar = ch;
14792 len--;
14793 pindex++;
14794 }
14795 else if (arg->flags & F_SIGN)
14796 signchar = '+';
14797 else if (arg->flags & F_BLANK)
14798 signchar = ' ';
14799 else
14800 arg->sign = 0;
14801 }
14802 if (arg->width < len)
14803 arg->width = len;
14804
14805 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014806 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014807 if (!(arg->flags & F_LJUST)) {
14808 if (arg->sign) {
14809 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014810 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014811 }
14812 else {
14813 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014814 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014815 }
14816 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014817 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14818 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014819 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014820 }
14821
Victor Stinnera47082312012-10-04 02:19:54 +020014822 buflen = arg->width;
14823 if (arg->sign && len == arg->width)
14824 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014825 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014826 return -1;
14827
14828 /* Write the sign if needed */
14829 if (arg->sign) {
14830 if (fill != ' ') {
14831 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14832 writer->pos += 1;
14833 }
14834 if (arg->width > len)
14835 arg->width--;
14836 }
14837
14838 /* Write the numeric prefix for "x", "X" and "o" formats
14839 if the alternate form is used.
14840 For example, write "0x" for the "%#x" format. */
14841 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14842 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14843 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14844 if (fill != ' ') {
14845 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14846 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14847 writer->pos += 2;
14848 pindex += 2;
14849 }
14850 arg->width -= 2;
14851 if (arg->width < 0)
14852 arg->width = 0;
14853 len -= 2;
14854 }
14855
14856 /* Pad left with the fill character if needed */
14857 if (arg->width > len && !(arg->flags & F_LJUST)) {
14858 sublen = arg->width - len;
14859 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14860 writer->pos += sublen;
14861 arg->width = len;
14862 }
14863
14864 /* If padding with spaces: write sign if needed and/or numeric prefix if
14865 the alternate form is used */
14866 if (fill == ' ') {
14867 if (arg->sign) {
14868 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14869 writer->pos += 1;
14870 }
14871 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14872 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14873 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14874 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14875 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14876 writer->pos += 2;
14877 pindex += 2;
14878 }
14879 }
14880
14881 /* Write characters */
14882 if (len) {
14883 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14884 str, pindex, len);
14885 writer->pos += len;
14886 }
14887
14888 /* Pad right with the fill character if needed */
14889 if (arg->width > len) {
14890 sublen = arg->width - len;
14891 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14892 writer->pos += sublen;
14893 }
14894 return 0;
14895}
14896
14897/* Helper of PyUnicode_Format(): format one arg.
14898 Return 0 on success, raise an exception and return -1 on error. */
14899static int
14900unicode_format_arg(struct unicode_formatter_t *ctx)
14901{
14902 struct unicode_format_arg_t arg;
14903 PyObject *str;
14904 int ret;
14905
Victor Stinner8dbd4212012-12-04 09:30:24 +010014906 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014907 if (arg.ch == '%') {
14908 ctx->fmtpos++;
14909 ctx->fmtcnt--;
14910 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14911 return -1;
14912 return 0;
14913 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014914 arg.flags = 0;
14915 arg.width = -1;
14916 arg.prec = -1;
14917 arg.sign = 0;
14918 str = NULL;
14919
Victor Stinnera47082312012-10-04 02:19:54 +020014920 ret = unicode_format_arg_parse(ctx, &arg);
14921 if (ret == -1)
14922 return -1;
14923
14924 ret = unicode_format_arg_format(ctx, &arg, &str);
14925 if (ret == -1)
14926 return -1;
14927
14928 if (ret != 1) {
14929 ret = unicode_format_arg_output(ctx, &arg, str);
14930 Py_DECREF(str);
14931 if (ret == -1)
14932 return -1;
14933 }
14934
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014935 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014936 PyErr_SetString(PyExc_TypeError,
14937 "not all arguments converted during string formatting");
14938 return -1;
14939 }
14940 return 0;
14941}
14942
Alexander Belopolsky40018472011-02-26 01:02:56 +000014943PyObject *
14944PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014945{
Victor Stinnera47082312012-10-04 02:19:54 +020014946 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014947
Guido van Rossumd57fd912000-03-10 22:53:23 +000014948 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014949 PyErr_BadInternalCall();
14950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014951 }
Victor Stinnera47082312012-10-04 02:19:54 +020014952
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014953 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014954 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014955
14956 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014957 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14958 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14959 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14960 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014961
Victor Stinner8f674cc2013-04-17 23:02:17 +020014962 _PyUnicodeWriter_Init(&ctx.writer);
14963 ctx.writer.min_length = ctx.fmtcnt + 100;
14964 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014965
Guido van Rossumd57fd912000-03-10 22:53:23 +000014966 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014967 ctx.arglen = PyTuple_Size(args);
14968 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014969 }
14970 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014971 ctx.arglen = -1;
14972 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014973 }
Victor Stinnera47082312012-10-04 02:19:54 +020014974 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014975 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014976 ctx.dict = args;
14977 else
14978 ctx.dict = NULL;
14979 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980
Victor Stinnera47082312012-10-04 02:19:54 +020014981 while (--ctx.fmtcnt >= 0) {
14982 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014983 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014984
14985 nonfmtpos = ctx.fmtpos++;
14986 while (ctx.fmtcnt >= 0 &&
14987 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14988 ctx.fmtpos++;
14989 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014990 }
Victor Stinnera47082312012-10-04 02:19:54 +020014991 if (ctx.fmtcnt < 0) {
14992 ctx.fmtpos--;
14993 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014994 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014995
Victor Stinnercfc4c132013-04-03 01:48:39 +020014996 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14997 nonfmtpos, ctx.fmtpos) < 0)
14998 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 }
15000 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015001 ctx.fmtpos++;
15002 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015003 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015004 }
15005 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015006
Victor Stinnera47082312012-10-04 02:19:54 +020015007 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015008 PyErr_SetString(PyExc_TypeError,
15009 "not all arguments converted during string formatting");
15010 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015011 }
15012
Victor Stinnera47082312012-10-04 02:19:54 +020015013 if (ctx.args_owned) {
15014 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015015 }
Victor Stinnera47082312012-10-04 02:19:54 +020015016 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015017
Benjamin Peterson29060642009-01-31 22:14:21 +000015018 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015019 _PyUnicodeWriter_Dealloc(&ctx.writer);
15020 if (ctx.args_owned) {
15021 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015022 }
15023 return NULL;
15024}
15025
Jeremy Hylton938ace62002-07-17 16:30:39 +000015026static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015027unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15028
Tim Peters6d6c1a32001-08-02 04:15:00 +000015029static PyObject *
15030unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15031{
Benjamin Peterson29060642009-01-31 22:14:21 +000015032 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015033 static char *kwlist[] = {"object", "encoding", "errors", 0};
15034 char *encoding = NULL;
15035 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015036
Benjamin Peterson14339b62009-01-31 16:36:08 +000015037 if (type != &PyUnicode_Type)
15038 return unicode_subtype_new(type, args, kwds);
15039 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015040 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015041 return NULL;
15042 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015043 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 if (encoding == NULL && errors == NULL)
15045 return PyObject_Str(x);
15046 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015047 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015048}
15049
Guido van Rossume023fe02001-08-30 03:12:59 +000015050static PyObject *
15051unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15052{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015053 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015054 Py_ssize_t length, char_size;
15055 int share_wstr, share_utf8;
15056 unsigned int kind;
15057 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015058
Benjamin Peterson14339b62009-01-31 16:36:08 +000015059 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015060
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015061 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015063 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015064 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015065 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015066 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015067 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015068 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015069
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015070 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015071 if (self == NULL) {
15072 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015073 return NULL;
15074 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 kind = PyUnicode_KIND(unicode);
15076 length = PyUnicode_GET_LENGTH(unicode);
15077
15078 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015079#ifdef Py_DEBUG
15080 _PyUnicode_HASH(self) = -1;
15081#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015082 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015083#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015084 _PyUnicode_STATE(self).interned = 0;
15085 _PyUnicode_STATE(self).kind = kind;
15086 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015087 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015088 _PyUnicode_STATE(self).ready = 1;
15089 _PyUnicode_WSTR(self) = NULL;
15090 _PyUnicode_UTF8_LENGTH(self) = 0;
15091 _PyUnicode_UTF8(self) = NULL;
15092 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015093 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015094
15095 share_utf8 = 0;
15096 share_wstr = 0;
15097 if (kind == PyUnicode_1BYTE_KIND) {
15098 char_size = 1;
15099 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15100 share_utf8 = 1;
15101 }
15102 else if (kind == PyUnicode_2BYTE_KIND) {
15103 char_size = 2;
15104 if (sizeof(wchar_t) == 2)
15105 share_wstr = 1;
15106 }
15107 else {
15108 assert(kind == PyUnicode_4BYTE_KIND);
15109 char_size = 4;
15110 if (sizeof(wchar_t) == 4)
15111 share_wstr = 1;
15112 }
15113
15114 /* Ensure we won't overflow the length. */
15115 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15116 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015117 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015118 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015119 data = PyObject_MALLOC((length + 1) * char_size);
15120 if (data == NULL) {
15121 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015122 goto onError;
15123 }
15124
Victor Stinnerc3c74152011-10-02 20:39:55 +020015125 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015126 if (share_utf8) {
15127 _PyUnicode_UTF8_LENGTH(self) = length;
15128 _PyUnicode_UTF8(self) = data;
15129 }
15130 if (share_wstr) {
15131 _PyUnicode_WSTR_LENGTH(self) = length;
15132 _PyUnicode_WSTR(self) = (wchar_t *)data;
15133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015134
Christian Heimesf051e432016-09-13 20:22:02 +020015135 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015136 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015137 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015138#ifdef Py_DEBUG
15139 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15140#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015141 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015142 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015143
15144onError:
15145 Py_DECREF(unicode);
15146 Py_DECREF(self);
15147 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015148}
15149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015150PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015151"str(object='') -> str\n\
15152str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015153\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015154Create a new string object from the given object. If encoding or\n\
15155errors is specified, then the object must expose a data buffer\n\
15156that will be decoded using the given encoding and error handler.\n\
15157Otherwise, returns the result of object.__str__() (if defined)\n\
15158or repr(object).\n\
15159encoding defaults to sys.getdefaultencoding().\n\
15160errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015161
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015162static PyObject *unicode_iter(PyObject *seq);
15163
Guido van Rossumd57fd912000-03-10 22:53:23 +000015164PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015165 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015166 "str", /* tp_name */
15167 sizeof(PyUnicodeObject), /* tp_size */
15168 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015169 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015170 (destructor)unicode_dealloc, /* tp_dealloc */
15171 0, /* tp_print */
15172 0, /* tp_getattr */
15173 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015174 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015175 unicode_repr, /* tp_repr */
15176 &unicode_as_number, /* tp_as_number */
15177 &unicode_as_sequence, /* tp_as_sequence */
15178 &unicode_as_mapping, /* tp_as_mapping */
15179 (hashfunc) unicode_hash, /* tp_hash*/
15180 0, /* tp_call*/
15181 (reprfunc) unicode_str, /* tp_str */
15182 PyObject_GenericGetAttr, /* tp_getattro */
15183 0, /* tp_setattro */
15184 0, /* tp_as_buffer */
15185 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015186 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015187 unicode_doc, /* tp_doc */
15188 0, /* tp_traverse */
15189 0, /* tp_clear */
15190 PyUnicode_RichCompare, /* tp_richcompare */
15191 0, /* tp_weaklistoffset */
15192 unicode_iter, /* tp_iter */
15193 0, /* tp_iternext */
15194 unicode_methods, /* tp_methods */
15195 0, /* tp_members */
15196 0, /* tp_getset */
15197 &PyBaseObject_Type, /* tp_base */
15198 0, /* tp_dict */
15199 0, /* tp_descr_get */
15200 0, /* tp_descr_set */
15201 0, /* tp_dictoffset */
15202 0, /* tp_init */
15203 0, /* tp_alloc */
15204 unicode_new, /* tp_new */
15205 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015206};
15207
15208/* Initialize the Unicode implementation */
15209
Victor Stinner3a50e702011-10-18 21:21:00 +020015210int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015211{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015212 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015213 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015214 0x000A, /* LINE FEED */
15215 0x000D, /* CARRIAGE RETURN */
15216 0x001C, /* FILE SEPARATOR */
15217 0x001D, /* GROUP SEPARATOR */
15218 0x001E, /* RECORD SEPARATOR */
15219 0x0085, /* NEXT LINE */
15220 0x2028, /* LINE SEPARATOR */
15221 0x2029, /* PARAGRAPH SEPARATOR */
15222 };
15223
Fred Drakee4315f52000-05-09 19:53:39 +000015224 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015225 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015226 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015227 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015228 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015229
Guido van Rossumcacfc072002-05-24 19:01:59 +000015230 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015231 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015232
15233 /* initialize the linebreak bloom filter */
15234 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015235 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015236 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015237
Christian Heimes26532f72013-07-20 14:57:16 +020015238 if (PyType_Ready(&EncodingMapType) < 0)
15239 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015240
Benjamin Petersonc4311282012-10-30 23:21:10 -040015241 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15242 Py_FatalError("Can't initialize field name iterator type");
15243
15244 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15245 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015246
Victor Stinner3a50e702011-10-18 21:21:00 +020015247 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015248}
15249
15250/* Finalize the Unicode implementation */
15251
/* This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15257
Guido van Rossumd57fd912000-03-10 22:53:23 +000015258void
Thomas Wouters78890102000-07-22 19:25:51 +000015259_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015260{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015261 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015262
Serhiy Storchaka05997252013-01-26 12:14:02 +020015263 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015264
Serhiy Storchaka05997252013-01-26 12:14:02 +020015265 for (i = 0; i < 256; i++)
15266 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015267 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015268 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015269}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015270
Walter Dörwald16807132007-05-25 13:52:07 +000015271void
15272PyUnicode_InternInPlace(PyObject **p)
15273{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015274 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015276#ifdef Py_DEBUG
15277 assert(s != NULL);
15278 assert(_PyUnicode_CHECK(s));
15279#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015281 return;
15282#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 /* If it's a subclass, we don't really know what putting
15284 it in the interned dict might do. */
15285 if (!PyUnicode_CheckExact(s))
15286 return;
15287 if (PyUnicode_CHECK_INTERNED(s))
15288 return;
15289 if (interned == NULL) {
15290 interned = PyDict_New();
15291 if (interned == NULL) {
15292 PyErr_Clear(); /* Don't leave an exception */
15293 return;
15294 }
15295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015296 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015297 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015299 if (t == NULL) {
15300 PyErr_Clear();
15301 return;
15302 }
15303 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015304 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015305 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015306 return;
15307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 /* The two references in interned are not counted by refcnt.
15309 The deallocator will take care of this */
15310 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015311 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015312}
15313
15314void
15315PyUnicode_InternImmortal(PyObject **p)
15316{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015317 PyUnicode_InternInPlace(p);
15318 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015319 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015320 Py_INCREF(*p);
15321 }
Walter Dörwald16807132007-05-25 13:52:07 +000015322}
15323
15324PyObject *
15325PyUnicode_InternFromString(const char *cp)
15326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015327 PyObject *s = PyUnicode_FromString(cp);
15328 if (s == NULL)
15329 return NULL;
15330 PyUnicode_InternInPlace(&s);
15331 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015332}
15333
Alexander Belopolsky40018472011-02-26 01:02:56 +000015334void
15335_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015336{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015338 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 Py_ssize_t i, n;
15340 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015341
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 if (interned == NULL || !PyDict_Check(interned))
15343 return;
15344 keys = PyDict_Keys(interned);
15345 if (keys == NULL || !PyList_Check(keys)) {
15346 PyErr_Clear();
15347 return;
15348 }
Walter Dörwald16807132007-05-25 13:52:07 +000015349
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15351 detector, interned unicode strings are not forcibly deallocated;
15352 rather, we give them their stolen references back, and then clear
15353 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015354
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 n = PyList_GET_SIZE(keys);
15356 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015357 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015358 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015359 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015360 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015361 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015363 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 case SSTATE_NOT_INTERNED:
15365 /* XXX Shouldn't happen */
15366 break;
15367 case SSTATE_INTERNED_IMMORTAL:
15368 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015369 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 break;
15371 case SSTATE_INTERNED_MORTAL:
15372 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015373 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 break;
15375 default:
15376 Py_FatalError("Inconsistent interned string state.");
15377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015378 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015379 }
15380 fprintf(stderr, "total size of all interned strings: "
15381 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15382 "mortal/immortal\n", mortal_size, immortal_size);
15383 Py_DECREF(keys);
15384 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015385 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015386}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015387
15388
15389/********************* Unicode Iterator **************************/
15390
15391typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015392 PyObject_HEAD
15393 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015394 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015395} unicodeiterobject;
15396
15397static void
15398unicodeiter_dealloc(unicodeiterobject *it)
15399{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015400 _PyObject_GC_UNTRACK(it);
15401 Py_XDECREF(it->it_seq);
15402 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015403}
15404
15405static int
15406unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15407{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 Py_VISIT(it->it_seq);
15409 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410}
15411
15412static PyObject *
15413unicodeiter_next(unicodeiterobject *it)
15414{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015415 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015416
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 assert(it != NULL);
15418 seq = it->it_seq;
15419 if (seq == NULL)
15420 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015421 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015423 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15424 int kind = PyUnicode_KIND(seq);
15425 void *data = PyUnicode_DATA(seq);
15426 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15427 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015428 if (item != NULL)
15429 ++it->it_index;
15430 return item;
15431 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015432
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015434 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015435 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015436}
15437
15438static PyObject *
15439unicodeiter_len(unicodeiterobject *it)
15440{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015441 Py_ssize_t len = 0;
15442 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015443 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015444 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015445}
15446
15447PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15448
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015449static PyObject *
15450unicodeiter_reduce(unicodeiterobject *it)
15451{
15452 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015453 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015454 it->it_seq, it->it_index);
15455 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015456 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457 if (u == NULL)
15458 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015459 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015460 }
15461}
15462
15463PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15464
15465static PyObject *
15466unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15467{
15468 Py_ssize_t index = PyLong_AsSsize_t(state);
15469 if (index == -1 && PyErr_Occurred())
15470 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015471 if (it->it_seq != NULL) {
15472 if (index < 0)
15473 index = 0;
15474 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15475 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15476 it->it_index = index;
15477 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015478 Py_RETURN_NONE;
15479}
15480
15481PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15482
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015483static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015484 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015485 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015486 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15487 reduce_doc},
15488 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15489 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015490 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015491};
15492
15493PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15495 "str_iterator", /* tp_name */
15496 sizeof(unicodeiterobject), /* tp_basicsize */
15497 0, /* tp_itemsize */
15498 /* methods */
15499 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15500 0, /* tp_print */
15501 0, /* tp_getattr */
15502 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015503 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 0, /* tp_repr */
15505 0, /* tp_as_number */
15506 0, /* tp_as_sequence */
15507 0, /* tp_as_mapping */
15508 0, /* tp_hash */
15509 0, /* tp_call */
15510 0, /* tp_str */
15511 PyObject_GenericGetAttr, /* tp_getattro */
15512 0, /* tp_setattro */
15513 0, /* tp_as_buffer */
15514 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15515 0, /* tp_doc */
15516 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15517 0, /* tp_clear */
15518 0, /* tp_richcompare */
15519 0, /* tp_weaklistoffset */
15520 PyObject_SelfIter, /* tp_iter */
15521 (iternextfunc)unicodeiter_next, /* tp_iternext */
15522 unicodeiter_methods, /* tp_methods */
15523 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015524};
15525
15526static PyObject *
15527unicode_iter(PyObject *seq)
15528{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015529 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015530
Benjamin Peterson14339b62009-01-31 16:36:08 +000015531 if (!PyUnicode_Check(seq)) {
15532 PyErr_BadInternalCall();
15533 return NULL;
15534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015535 if (PyUnicode_READY(seq) == -1)
15536 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015537 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15538 if (it == NULL)
15539 return NULL;
15540 it->it_index = 0;
15541 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015542 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015543 _PyObject_GC_TRACK(it);
15544 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015545}
15546
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015547
/* Number of characters in u before the terminating null character. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);
    return length;
}
15553
/* Copy s2, including its terminating null character, into s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch != 0);

    return s1;
}
15561
/* Copy at most n characters from s2 into s1 and return s1.

   Copying stops right after a null character has been copied, or once n
   characters have been written, whichever comes first.  As with the
   standard strncpy(), s1 is NOT null-terminated when s2 holds n or more
   characters.  Unlike strncpy(), the rest of s1 is not padded with null
   characters.

   The previous loop stored a character before testing n, so it could
   write n + 1 characters (and one character even for n == 0), overrunning
   a destination sized for exactly n characters. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Check the remaining budget BEFORE every store. */
    while (n != 0) {
        n--;
        if ((*u++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15571
/* Append s2, including its terminating null character, to the end of
   the string in s1.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at s1's current terminator. */
    Py_UNICODE *tail = s1 + wcslen(s1);

    do {
        *tail = *s2++;
    } while (*tail++ != 0);

    return s1;
}
15580
/* Compare two null-terminated strings character by character.
   Returns -1, 0 or +1 when s1 is less than, equal to or greater than s2;
   a string that is a proper prefix of the other compares as less. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        if (*s1 == 0 || *s2 == 0) {
            /* At least one string has ended. */
            if (*s1) {
                return 1;
            }
            if (*s2) {
                return -1;
            }
            return 0;
        }
        if (*s1 != *s2) {
            return (*s1 < *s2) ? -1 : +1;
        }
    }
}
15594
/* Compare at most n characters of s1 and s2.
   Returns -1 or +1 at the first differing character, and 0 when the
   strings are equal up to a null character or over the first n
   characters. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE left, right;

    while (n != 0) {
        left = *s1++;
        right = *s2++;
        if (left != right) {
            return (left < right) ? -1 : +1;
        }
        if (left == '\0') {
            return 0;
        }
        n--;
    }
    return 0;
}
15611
/* Find the first occurrence of c in s.
   Returns a pointer to it, or NULL if it does not occur.  The
   terminating null character is never examined, so searching for 0
   yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15621
/* Find the last occurrence of c in s.
   Returns a pointer to it, or NULL if it does not occur.  The
   terminating null character is never examined, so searching for 0
   yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    /* Walk backwards from the last character to the first. */
    while (pos != 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015634
Victor Stinner71133ff2010-09-01 23:43:53 +000015635Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015636PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015637{
Victor Stinner577db2c2011-10-11 22:12:48 +020015638 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015639 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015641 if (!PyUnicode_Check(unicode)) {
15642 PyErr_BadArgument();
15643 return NULL;
15644 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015645 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015646 if (u == NULL)
15647 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015648 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015649 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015650 PyErr_NoMemory();
15651 return NULL;
15652 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015653 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015654 size *= sizeof(Py_UNICODE);
15655 copy = PyMem_Malloc(size);
15656 if (copy == NULL) {
15657 PyErr_NoMemory();
15658 return NULL;
15659 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015660 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015661 return copy;
15662}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015663
Georg Brandl66c221e2010-10-14 07:04:07 +000015664/* A _string module, to export formatter_parser and formatter_field_name_split
15665 to the string.Formatter class implemented in Python. */
15666
15667static PyMethodDef _string_methods[] = {
15668 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15669 METH_O, PyDoc_STR("split the argument as a field name")},
15670 {"formatter_parser", (PyCFunction) formatter_parser,
15671 METH_O, PyDoc_STR("parse the argument as a format string")},
15672 {NULL, NULL}
15673};
15674
15675static struct PyModuleDef _string_module = {
15676 PyModuleDef_HEAD_INIT,
15677 "_string",
15678 PyDoc_STR("string helper module"),
15679 0,
15680 _string_methods,
15681 NULL,
15682 NULL,
15683 NULL,
15684 NULL
15685};
15686
15687PyMODINIT_FUNC
15688PyInit__string(void)
15689{
15690 return PyModule_Create(&_string_module);
15691}
15692
15693
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015694#ifdef __cplusplus
15695}
15696#endif