/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros in this file.
   In Py_DEBUG builds it runs _PyUnicode_CheckConsistency() with
   check_content=0; otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors for the string structs (PyASCIIObject,
   PyCompactUnicodeObject, PyUnicodeObject).

   The bare `_PyUnicode_XXX(op)` forms only cast `op` and read a struct
   member, so most of them expand to lvalues. The forms that start with
   assert() validate `op` first and are rvalue-only. */

/* Raw utf8 pointer stored in the compact struct (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory directly following the PyASCIIObject header, otherwise the stored
   utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for compact ASCII objects this is the
   `length` field itself, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of state.kind. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of the length field. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any pointer of the PyUnicodeObject layout (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local replacement for PyUnicode_READY(): asserts that `op` passes
   _PyUnicode_CHECK(), then evaluates to 0 if the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* true if the utf8 pointer aliases the character data of `op`
   (must not be used on compact ASCII objects) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer aliases the character data of `op`.
   Uses the macro parameter `op` throughout; the previous expansion read
   `_PyUnicode_WSTR(unicode)` and therefore only worked when the caller
   happened to have a variable literally named `unicode` in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): not compact ASCII, utf8 is non-NULL and
   utf8 does not alias the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr does not alias the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each macro argument is evaluated exactly once, before any of the
   loop-local variables are used.  The first loop copies four elements per
   iteration up to the largest multiple of 4; the second loop copies the
   remaining 0-3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
/* Platform-dependent overallocation factor.
   NOTE(review): the percentages below imply the factor is used as a
   divisor of the requested size (size / 2 = 50%, size / 4 = 25%); the code
   that uses it is not in this part of the file -- confirm. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215#define _Py_RETURN_UNICODE_EMPTY() \
216 do { \
217 _Py_INCREF_UNICODE_EMPTY(); \
218 return unicode_empty; \
219 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200221/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700222static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
224
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200225/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000228/* Single character Unicode strings in the Latin-1 range are being
229 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200230static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by ASCII code point: 1 for whitespace,
   0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
262
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200263/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200264static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100266static int unicode_modifiable(PyObject *unicode);
267
Victor Stinnerfe226c02011-10-03 03:52:20 +0200268
Alexander Belopolsky40018472011-02-26 01:02:56 +0000269static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100270_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200271static PyObject *
272_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
273static PyObject *
274_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
275
276static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000278 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100279 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
281
Alexander Belopolsky40018472011-02-26 01:02:56 +0000282static void
283raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300284 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100285 PyObject *unicode,
286 Py_ssize_t startpos, Py_ssize_t endpos,
287 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000288
/* Same for linebreaks.
   128-entry lookup table indexed by ASCII code point: 1 for line break
   characters, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
316
INADA Naoki3ae20562017-01-16 20:41:20 +0900317static int convert_uc(PyObject *obj, void *addr);
318
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300319#include "clinic/unicodeobject.c.h"
320
/* Error handlers recognised by name.  _Py_ERROR_UNKNOWN (0) is never
   returned by get_error_handler(); _Py_ERROR_OTHER stands for any name
   that is not one of the handlers listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching enum value
   for the other known names, and _Py_ERROR_OTHER for anything else.
   The comparison is exact (case-sensitive strcmp). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    /* Checked in this order, "strict" first. */
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
359
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300360/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
361 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000362Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000363PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000364{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000365#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000366 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000367#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000368 /* This is actually an illegal character, so it should
369 not be passed to unichr. */
370 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000371#endif
372}
373
#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not in the wchar-only
                  state), also scan the characters to verify that the
                  narrowest possible kind is used and that the buffer is
                  NUL-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* utf8 shares the character data */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be the NUL terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
489
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100490static PyObject*
491unicode_result_wchar(PyObject *unicode)
492{
493#ifndef Py_DEBUG
494 Py_ssize_t len;
495
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 len = _PyUnicode_WSTR_LENGTH(unicode);
497 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100498 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100500 }
501
502 if (len == 1) {
503 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100504 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100505 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
506 Py_DECREF(unicode);
507 return latin1_char;
508 }
509 }
510
511 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200512 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100513 return NULL;
514 }
515#else
Victor Stinneraa771272012-10-04 02:32:58 +0200516 assert(Py_REFCNT(unicode) == 1);
517
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 /* don't make the result ready in debug mode to ensure that the caller
519 makes the string ready before using it */
520 assert(_PyUnicode_CheckConsistency(unicode, 1));
521#endif
522 return unicode;
523}
524
525static PyObject*
526unicode_result_ready(PyObject *unicode)
527{
528 Py_ssize_t length;
529
530 length = PyUnicode_GET_LENGTH(unicode);
531 if (length == 0) {
532 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100533 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200534 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100535 }
536 return unicode_empty;
537 }
538
539 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200540 void *data = PyUnicode_DATA(unicode);
541 int kind = PyUnicode_KIND(unicode);
542 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100543 if (ch < 256) {
544 PyObject *latin1_char = unicode_latin1[ch];
545 if (latin1_char != NULL) {
546 if (unicode != latin1_char) {
547 Py_INCREF(latin1_char);
548 Py_DECREF(unicode);
549 }
550 return latin1_char;
551 }
552 else {
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554 Py_INCREF(unicode);
555 unicode_latin1[ch] = unicode;
556 return unicode;
557 }
558 }
559 }
560
561 assert(_PyUnicode_CheckConsistency(unicode, 1));
562 return unicode;
563}
564
565static PyObject*
566unicode_result(PyObject *unicode)
567{
568 assert(_PyUnicode_CHECK(unicode));
569 if (PyUnicode_IS_READY(unicode))
570 return unicode_result_ready(unicode);
571 else
572 return unicode_result_wchar(unicode);
573}
574
Victor Stinnerc4b49542011-12-11 22:44:26 +0100575static PyObject*
576unicode_result_unchanged(PyObject *unicode)
577{
578 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500579 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100580 return NULL;
581 Py_INCREF(unicode);
582 return unicode;
583 }
584 else
585 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100586 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100587}
588
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200589/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
590 ASCII, Latin1, UTF-8, etc. */
591static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200592backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
594{
Victor Stinnerad771582015-10-09 12:38:53 +0200595 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200596 Py_UCS4 ch;
597 enum PyUnicode_Kind kind;
598 void *data;
599
600 assert(PyUnicode_IS_READY(unicode));
601 kind = PyUnicode_KIND(unicode);
602 data = PyUnicode_DATA(unicode);
603
604 size = 0;
605 /* determine replacement size */
606 for (i = collstart; i < collend; ++i) {
607 Py_ssize_t incr;
608
609 ch = PyUnicode_READ(kind, data, i);
610 if (ch < 0x100)
611 incr = 2+2;
612 else if (ch < 0x10000)
613 incr = 2+4;
614 else {
615 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200616 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200617 }
618 if (size > PY_SSIZE_T_MAX - incr) {
619 PyErr_SetString(PyExc_OverflowError,
620 "encoded result is too long for a Python string");
621 return NULL;
622 }
623 size += incr;
624 }
625
Victor Stinnerad771582015-10-09 12:38:53 +0200626 str = _PyBytesWriter_Prepare(writer, str, size);
627 if (str == NULL)
628 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200629
630 /* generate replacement */
631 for (i = collstart; i < collend; ++i) {
632 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200633 *str++ = '\\';
634 if (ch >= 0x00010000) {
635 *str++ = 'U';
636 *str++ = Py_hexdigits[(ch>>28)&0xf];
637 *str++ = Py_hexdigits[(ch>>24)&0xf];
638 *str++ = Py_hexdigits[(ch>>20)&0xf];
639 *str++ = Py_hexdigits[(ch>>16)&0xf];
640 *str++ = Py_hexdigits[(ch>>12)&0xf];
641 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200642 }
Victor Stinner797485e2015-10-09 03:17:30 +0200643 else if (ch >= 0x100) {
644 *str++ = 'u';
645 *str++ = Py_hexdigits[(ch>>12)&0xf];
646 *str++ = Py_hexdigits[(ch>>8)&0xf];
647 }
648 else
649 *str++ = 'x';
650 *str++ = Py_hexdigits[(ch>>4)&0xf];
651 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200652 }
653 return str;
654}
655
656/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 10)
678 incr = 2+1+1;
679 else if (ch < 100)
680 incr = 2+2+1;
681 else if (ch < 1000)
682 incr = 2+3+1;
683 else if (ch < 10000)
684 incr = 2+4+1;
685 else if (ch < 100000)
686 incr = 2+5+1;
687 else if (ch < 1000000)
688 incr = 2+6+1;
689 else {
690 assert(ch <= MAX_UNICODE);
691 incr = 2+7+1;
692 }
693 if (size > PY_SSIZE_T_MAX - incr) {
694 PyErr_SetString(PyExc_OverflowError,
695 "encoded result is too long for a Python string");
696 return NULL;
697 }
698 size += incr;
699 }
700
Victor Stinnerad771582015-10-09 12:38:53 +0200701 str = _PyBytesWriter_Prepare(writer, str, size);
702 if (str == NULL)
703 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200704
705 /* generate replacement */
706 for (i = collstart; i < collend; ++i) {
707 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
708 }
709 return str;
710}
711
Thomas Wouters477c8d52006-05-27 19:21:47 +0000712/* --- Bloom Filters ----------------------------------------------------- */
713
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (log2(BLOOM_WIDTH) of them, i.e. 5, 6 or 7) as
   the bit index. */
717
718/* the linebreak mask is set up by Unicode_Init below */
719
Antoine Pitrouf068f942010-01-13 14:19:12 +0000720#if LONG_BIT >= 128
721#define BLOOM_WIDTH 128
722#elif LONG_BIT >= 64
723#define BLOOM_WIDTH 64
724#elif LONG_BIT >= 32
725#define BLOOM_WIDTH 32
726#else
727#error "LONG_BIT is smaller than 32"
728#endif
729
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730#define BLOOM_MASK unsigned long
731
Serhiy Storchaka05997252013-01-26 12:14:02 +0200732static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000733
Antoine Pitrouf068f942010-01-13 14:19:12 +0000734#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000735
Benjamin Peterson29060642009-01-31 22:14:21 +0000736#define BLOOM_LINEBREAK(ch) \
737 ((ch) < 128U ? ascii_linebreak[(ch)] : \
738 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000739
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700740static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000742{
Victor Stinnera85af502013-04-09 21:53:54 +0200743#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
744 do { \
745 TYPE *data = (TYPE *)PTR; \
746 TYPE *end = data + LEN; \
747 Py_UCS4 ch; \
748 for (; data != end; data++) { \
749 ch = *data; \
750 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
751 } \
752 break; \
753 } while (0)
754
Thomas Wouters477c8d52006-05-27 19:21:47 +0000755 /* calculate simple bloom-style bitmask for a given unicode string */
756
Antoine Pitrouf068f942010-01-13 14:19:12 +0000757 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000758
759 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200760 switch (kind) {
761 case PyUnicode_1BYTE_KIND:
762 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
763 break;
764 case PyUnicode_2BYTE_KIND:
765 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
766 break;
767 case PyUnicode_4BYTE_KIND:
768 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
769 break;
770 default:
771 assert(0);
772 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000773 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200774
775#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000776}
777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300778static int
779ensure_unicode(PyObject *obj)
780{
781 if (!PyUnicode_Check(obj)) {
782 PyErr_Format(PyExc_TypeError,
783 "must be str, not %.100s",
784 Py_TYPE(obj)->tp_name);
785 return -1;
786 }
787 return PyUnicode_READY(obj);
788}
789
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200790/* Compilation of templated routines */
791
792#include "stringlib/asciilib.h"
793#include "stringlib/fastsearch.h"
794#include "stringlib/partition.h"
795#include "stringlib/split.h"
796#include "stringlib/count.h"
797#include "stringlib/find.h"
798#include "stringlib/find_max_char.h"
799#include "stringlib/localeutil.h"
800#include "stringlib/undef.h"
801
802#include "stringlib/ucs1lib.h"
803#include "stringlib/fastsearch.h"
804#include "stringlib/partition.h"
805#include "stringlib/split.h"
806#include "stringlib/count.h"
807#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300808#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200809#include "stringlib/find_max_char.h"
810#include "stringlib/localeutil.h"
811#include "stringlib/undef.h"
812
813#include "stringlib/ucs2lib.h"
814#include "stringlib/fastsearch.h"
815#include "stringlib/partition.h"
816#include "stringlib/split.h"
817#include "stringlib/count.h"
818#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300819#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200820#include "stringlib/find_max_char.h"
821#include "stringlib/localeutil.h"
822#include "stringlib/undef.h"
823
824#include "stringlib/ucs4lib.h"
825#include "stringlib/fastsearch.h"
826#include "stringlib/partition.h"
827#include "stringlib/split.h"
828#include "stringlib/count.h"
829#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300830#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200831#include "stringlib/find_max_char.h"
832#include "stringlib/localeutil.h"
833#include "stringlib/undef.h"
834
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200835#include "stringlib/unicodedefs.h"
836#include "stringlib/fastsearch.h"
837#include "stringlib/count.h"
838#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100839#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200840
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841/* --- Unicode Object ----------------------------------------------------- */
842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200843static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200844fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700846static inline Py_ssize_t
847findchar(const void *s, int kind,
848 Py_ssize_t size, Py_UCS4 ch,
849 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200851 switch (kind) {
852 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200853 if ((Py_UCS1) ch != ch)
854 return -1;
855 if (direction > 0)
856 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
857 else
858 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200859 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200860 if ((Py_UCS2) ch != ch)
861 return -1;
862 if (direction > 0)
863 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
864 else
865 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200866 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200867 if (direction > 0)
868 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
869 else
870 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200871 default:
872 assert(0);
873 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875}
876
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length of the
   string are overwritten, with 0xff bytes; nothing is done if the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind is used as the size in bytes of one character */
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
895
Victor Stinnerfe226c02011-10-03 03:52:20 +0200896static PyObject*
897resize_compact(PyObject *unicode, Py_ssize_t length)
898{
899 Py_ssize_t char_size;
900 Py_ssize_t struct_size;
901 Py_ssize_t new_size;
902 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100903 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200904#ifdef Py_DEBUG
905 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
906#endif
907
Victor Stinner79891572012-05-03 13:43:07 +0200908 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200909 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100910 assert(PyUnicode_IS_COMPACT(unicode));
911
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200912 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100913 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200914 struct_size = sizeof(PyASCIIObject);
915 else
916 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200917 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918
Victor Stinnerfe226c02011-10-03 03:52:20 +0200919 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
920 PyErr_NoMemory();
921 return NULL;
922 }
923 new_size = (struct_size + (length + 1) * char_size);
924
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200925 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
926 PyObject_DEL(_PyUnicode_UTF8(unicode));
927 _PyUnicode_UTF8(unicode) = NULL;
928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
929 }
Victor Stinner84def372011-12-11 20:04:56 +0100930 _Py_DEC_REFTOTAL;
931 _Py_ForgetReference(unicode);
932
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300933 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100934 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100935 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200936 PyErr_NoMemory();
937 return NULL;
938 }
Victor Stinner84def372011-12-11 20:04:56 +0100939 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100941
Victor Stinnerfe226c02011-10-03 03:52:20 +0200942 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200943 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100945 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200946 _PyUnicode_WSTR_LENGTH(unicode) = length;
947 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100948 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
949 PyObject_DEL(_PyUnicode_WSTR(unicode));
950 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100951 if (!PyUnicode_IS_ASCII(unicode))
952 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100953 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200954#ifdef Py_DEBUG
955 unicode_fill_invalid(unicode, old_length);
956#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
958 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200959 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960 return unicode;
961}
962
Alexander Belopolsky40018472011-02-26 01:02:56 +0000963static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200964resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965{
Victor Stinner95663112011-10-04 01:03:50 +0200966 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100967 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200969 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000970
Victor Stinnerfe226c02011-10-03 03:52:20 +0200971 if (PyUnicode_IS_READY(unicode)) {
972 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200973 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200974 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200975#ifdef Py_DEBUG
976 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
977#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978
979 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200980 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200981 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
982 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200983
984 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
985 PyErr_NoMemory();
986 return -1;
987 }
988 new_size = (length + 1) * char_size;
989
Victor Stinner7a9105a2011-12-12 00:13:42 +0100990 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
991 {
992 PyObject_DEL(_PyUnicode_UTF8(unicode));
993 _PyUnicode_UTF8(unicode) = NULL;
994 _PyUnicode_UTF8_LENGTH(unicode) = 0;
995 }
996
Victor Stinnerfe226c02011-10-03 03:52:20 +0200997 data = (PyObject *)PyObject_REALLOC(data, new_size);
998 if (data == NULL) {
999 PyErr_NoMemory();
1000 return -1;
1001 }
1002 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001003 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001005 _PyUnicode_WSTR_LENGTH(unicode) = length;
1006 }
1007 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001008 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001009 _PyUnicode_UTF8_LENGTH(unicode) = length;
1010 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001011 _PyUnicode_LENGTH(unicode) = length;
1012 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001013#ifdef Py_DEBUG
1014 unicode_fill_invalid(unicode, old_length);
1015#endif
Victor Stinner95663112011-10-04 01:03:50 +02001016 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001017 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001020 }
Victor Stinner95663112011-10-04 01:03:50 +02001021 assert(_PyUnicode_WSTR(unicode) != NULL);
1022
1023 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001024 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001025 PyErr_NoMemory();
1026 return -1;
1027 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001029 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001031 if (!wstr) {
1032 PyErr_NoMemory();
1033 return -1;
1034 }
1035 _PyUnicode_WSTR(unicode) = wstr;
1036 _PyUnicode_WSTR(unicode)[length] = 0;
1037 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001038 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 return 0;
1040}
1041
Victor Stinnerfe226c02011-10-03 03:52:20 +02001042static PyObject*
1043resize_copy(PyObject *unicode, Py_ssize_t length)
1044{
1045 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001046 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001048
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001049 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050
1051 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1052 if (copy == NULL)
1053 return NULL;
1054
1055 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001056 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001057 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001058 }
1059 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001060 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001061
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001062 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 if (w == NULL)
1064 return NULL;
1065 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1066 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001067 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001068 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001069 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 }
1071}
1072
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001074 Ux0000 terminated; some code (e.g. new_identifier)
1075 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076
1077 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001078 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079
1080*/
1081
Alexander Belopolsky40018472011-02-26 01:02:56 +00001082static PyUnicodeObject *
1083_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001085 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087
Thomas Wouters477c8d52006-05-27 19:21:47 +00001088 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 if (length == 0 && unicode_empty != NULL) {
1090 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001091 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 }
1093
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001094 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001095 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001096 return (PyUnicodeObject *)PyErr_NoMemory();
1097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 if (length < 0) {
1099 PyErr_SetString(PyExc_SystemError,
1100 "Negative size passed to _PyUnicode_New");
1101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 }
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1105 if (unicode == NULL)
1106 return NULL;
1107 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001108
1109 _PyUnicode_WSTR_LENGTH(unicode) = length;
1110 _PyUnicode_HASH(unicode) = -1;
1111 _PyUnicode_STATE(unicode).interned = 0;
1112 _PyUnicode_STATE(unicode).kind = 0;
1113 _PyUnicode_STATE(unicode).compact = 0;
1114 _PyUnicode_STATE(unicode).ready = 0;
1115 _PyUnicode_STATE(unicode).ascii = 0;
1116 _PyUnicode_DATA_ANY(unicode) = NULL;
1117 _PyUnicode_LENGTH(unicode) = 0;
1118 _PyUnicode_UTF8(unicode) = NULL;
1119 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1122 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001123 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001125 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127
Jeremy Hyltond8082792003-09-16 19:41:39 +00001128 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001129 * the caller fails before initializing str -- unicode_resize()
1130 * reads str[0], and the Keep-Alive optimization can keep memory
1131 * allocated for str alive across a call to unicode_dealloc(unicode).
1132 * We don't want unicode_resize to read uninitialized memory in
1133 * that case.
1134 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 _PyUnicode_WSTR(unicode)[0] = 0;
1136 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001137
Victor Stinner7931d9a2011-11-04 00:22:48 +01001138 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139 return unicode;
1140}
1141
Victor Stinnerf42dc442011-10-02 23:33:16 +02001142static const char*
1143unicode_kind_name(PyObject *unicode)
1144{
Victor Stinner42dfd712011-10-03 14:41:45 +02001145 /* don't check consistency: unicode_kind_name() is called from
1146 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001147 if (!PyUnicode_IS_COMPACT(unicode))
1148 {
1149 if (!PyUnicode_IS_READY(unicode))
1150 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001151 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001152 {
1153 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001154 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001155 return "legacy ascii";
1156 else
1157 return "legacy latin1";
1158 case PyUnicode_2BYTE_KIND:
1159 return "legacy UCS2";
1160 case PyUnicode_4BYTE_KIND:
1161 return "legacy UCS4";
1162 default:
1163 return "<legacy invalid kind>";
1164 }
1165 }
1166 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001167 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001168 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001169 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001170 return "ascii";
1171 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001172 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001173 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001174 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001175 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001176 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001177 default:
1178 return "<invalid compact kind>";
1179 }
1180}
1181
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print to stdout the address of the object, its compact flags and the
   candidate addresses of its character data, then return
   PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print to stdout a one-line description of the string `op`: kind name,
   length, wstr pointer and, unless the string is compact ASCII, the wstr
   length and the utf8 pointer and length, followed by the data pointer.
   "shared " is printed before a pointer which is equal to the data pointer.

   The lengths are Py_ssize_t values, so they are printed with a signed
   conversion; pointers are cast to void* as required by "%p". */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without the PyUnicode_DATA() macro. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%" PY_FORMAT_SIZE_T "d, ",
           unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The fields below are only read when the string is not compact ASCII. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%" PY_FORMAT_SIZE_T "d), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "d)",
               (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
1235
1236PyObject *
1237PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1238{
1239 PyObject *obj;
1240 PyCompactUnicodeObject *unicode;
1241 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001242 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001243 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 Py_ssize_t char_size;
1245 Py_ssize_t struct_size;
1246
1247 /* Optimization for empty strings */
1248 if (size == 0 && unicode_empty != NULL) {
1249 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001250 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
1252
Victor Stinner9e9d6892011-10-04 01:02:02 +02001253 is_ascii = 0;
1254 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 struct_size = sizeof(PyCompactUnicodeObject);
1256 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001257 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 char_size = 1;
1259 is_ascii = 1;
1260 struct_size = sizeof(PyASCIIObject);
1261 }
1262 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001263 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 char_size = 1;
1265 }
1266 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001267 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 char_size = 2;
1269 if (sizeof(wchar_t) == 2)
1270 is_sharing = 1;
1271 }
1272 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001273 if (maxchar > MAX_UNICODE) {
1274 PyErr_SetString(PyExc_SystemError,
1275 "invalid maximum character passed to PyUnicode_New");
1276 return NULL;
1277 }
Victor Stinner8f825062012-04-27 13:55:39 +02001278 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 char_size = 4;
1280 if (sizeof(wchar_t) == 4)
1281 is_sharing = 1;
1282 }
1283
1284 /* Ensure we won't overflow the size. */
1285 if (size < 0) {
1286 PyErr_SetString(PyExc_SystemError,
1287 "Negative size passed to PyUnicode_New");
1288 return NULL;
1289 }
1290 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1291 return PyErr_NoMemory();
1292
1293 /* Duplicated allocation code from _PyObject_New() instead of a call to
1294 * PyObject_New() so we are able to allocate space for the object and
1295 * it's data buffer.
1296 */
1297 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1298 if (obj == NULL)
1299 return PyErr_NoMemory();
1300 obj = PyObject_INIT(obj, &PyUnicode_Type);
1301 if (obj == NULL)
1302 return NULL;
1303
1304 unicode = (PyCompactUnicodeObject *)obj;
1305 if (is_ascii)
1306 data = ((PyASCIIObject*)obj) + 1;
1307 else
1308 data = unicode + 1;
1309 _PyUnicode_LENGTH(unicode) = size;
1310 _PyUnicode_HASH(unicode) = -1;
1311 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001312 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 _PyUnicode_STATE(unicode).compact = 1;
1314 _PyUnicode_STATE(unicode).ready = 1;
1315 _PyUnicode_STATE(unicode).ascii = is_ascii;
1316 if (is_ascii) {
1317 ((char*)data)[size] = 0;
1318 _PyUnicode_WSTR(unicode) = NULL;
1319 }
Victor Stinner8f825062012-04-27 13:55:39 +02001320 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ((char*)data)[size] = 0;
1322 _PyUnicode_WSTR(unicode) = NULL;
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001325 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 else {
1328 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001329 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001330 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001332 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 ((Py_UCS4*)data)[size] = 0;
1334 if (is_sharing) {
1335 _PyUnicode_WSTR_LENGTH(unicode) = size;
1336 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1337 }
1338 else {
1339 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1340 _PyUnicode_WSTR(unicode) = NULL;
1341 }
1342 }
Victor Stinner8f825062012-04-27 13:55:39 +02001343#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001344 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001345#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001346 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347 return obj;
1348}
1349
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The characters of [begin, end) are written to the 4-byte data of unicode:
   a high surrogate immediately followed by a low surrogate is joined into a
   single code point, any other character is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 code_point = src[0];
        Py_ssize_t consumed = 1;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            code_point = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            consumed = 2;
        }
        *dst++ = code_point;
        src += consumed;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1389
Victor Stinnercd9950f2011-10-02 00:34:53 +02001390static int
Victor Stinner488fa492011-12-12 00:01:39 +01001391unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001392{
Victor Stinner488fa492011-12-12 00:01:39 +01001393 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001394 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001395 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001396 return -1;
1397 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001398 return 0;
1399}
1400
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001401static int
1402_copy_characters(PyObject *to, Py_ssize_t to_start,
1403 PyObject *from, Py_ssize_t from_start,
1404 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001406 unsigned int from_kind, to_kind;
1407 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinneree4544c2012-05-09 22:24:08 +02001409 assert(0 <= how_many);
1410 assert(0 <= from_start);
1411 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001413 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001414 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Victor Stinnerd3f08822012-05-29 12:57:52 +02001416 assert(PyUnicode_Check(to));
1417 assert(PyUnicode_IS_READY(to));
1418 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1419
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001420 if (how_many == 0)
1421 return 0;
1422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001424 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001426 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427
Victor Stinnerf1852262012-06-16 16:38:26 +02001428#ifdef Py_DEBUG
1429 if (!check_maxchar
1430 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1431 {
1432 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1433 Py_UCS4 ch;
1434 Py_ssize_t i;
1435 for (i=0; i < how_many; i++) {
1436 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1437 assert(ch <= to_maxchar);
1438 }
1439 }
1440#endif
1441
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001442 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001443 if (check_maxchar
1444 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1445 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001446 /* Writing Latin-1 characters into an ASCII string requires to
1447 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001448 Py_UCS4 max_char;
1449 max_char = ucs1lib_find_max_char(from_data,
1450 (Py_UCS1*)from_data + how_many);
1451 if (max_char >= 128)
1452 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001453 }
Christian Heimesf051e432016-09-13 20:22:02 +02001454 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001455 (char*)from_data + from_kind * from_start,
1456 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001458 else if (from_kind == PyUnicode_1BYTE_KIND
1459 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 {
1461 _PyUnicode_CONVERT_BYTES(
1462 Py_UCS1, Py_UCS2,
1463 PyUnicode_1BYTE_DATA(from) + from_start,
1464 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1465 PyUnicode_2BYTE_DATA(to) + to_start
1466 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001467 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001468 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001469 && to_kind == PyUnicode_4BYTE_KIND)
1470 {
1471 _PyUnicode_CONVERT_BYTES(
1472 Py_UCS1, Py_UCS4,
1473 PyUnicode_1BYTE_DATA(from) + from_start,
1474 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1475 PyUnicode_4BYTE_DATA(to) + to_start
1476 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001477 }
1478 else if (from_kind == PyUnicode_2BYTE_KIND
1479 && to_kind == PyUnicode_4BYTE_KIND)
1480 {
1481 _PyUnicode_CONVERT_BYTES(
1482 Py_UCS2, Py_UCS4,
1483 PyUnicode_2BYTE_DATA(from) + from_start,
1484 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1485 PyUnicode_4BYTE_DATA(to) + to_start
1486 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001487 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001489 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1490
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001491 if (!check_maxchar) {
1492 if (from_kind == PyUnicode_2BYTE_KIND
1493 && to_kind == PyUnicode_1BYTE_KIND)
1494 {
1495 _PyUnicode_CONVERT_BYTES(
1496 Py_UCS2, Py_UCS1,
1497 PyUnicode_2BYTE_DATA(from) + from_start,
1498 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1499 PyUnicode_1BYTE_DATA(to) + to_start
1500 );
1501 }
1502 else if (from_kind == PyUnicode_4BYTE_KIND
1503 && to_kind == PyUnicode_1BYTE_KIND)
1504 {
1505 _PyUnicode_CONVERT_BYTES(
1506 Py_UCS4, Py_UCS1,
1507 PyUnicode_4BYTE_DATA(from) + from_start,
1508 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1509 PyUnicode_1BYTE_DATA(to) + to_start
1510 );
1511 }
1512 else if (from_kind == PyUnicode_4BYTE_KIND
1513 && to_kind == PyUnicode_2BYTE_KIND)
1514 {
1515 _PyUnicode_CONVERT_BYTES(
1516 Py_UCS4, Py_UCS2,
1517 PyUnicode_4BYTE_DATA(from) + from_start,
1518 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1519 PyUnicode_2BYTE_DATA(to) + to_start
1520 );
1521 }
1522 else {
1523 assert(0);
1524 return -1;
1525 }
1526 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001527 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001528 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 Py_ssize_t i;
1531
Victor Stinnera0702ab2011-09-29 14:14:38 +02001532 for (i=0; i < how_many; i++) {
1533 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001534 if (ch > to_maxchar)
1535 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001536 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1537 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001538 }
1539 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001540 return 0;
1541}
1542
Victor Stinnerd3f08822012-05-29 12:57:52 +02001543void
1544_PyUnicode_FastCopyCharacters(
1545 PyObject *to, Py_ssize_t to_start,
1546 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547{
1548 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1549}
1550
1551Py_ssize_t
1552PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1553 PyObject *from, Py_ssize_t from_start,
1554 Py_ssize_t how_many)
1555{
1556 int err;
1557
1558 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1559 PyErr_BadInternalCall();
1560 return -1;
1561 }
1562
Benjamin Petersonbac79492012-01-14 13:34:47 -05001563 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001565 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001566 return -1;
1567
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001568 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001569 PyErr_SetString(PyExc_IndexError, "string index out of range");
1570 return -1;
1571 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001572 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001573 PyErr_SetString(PyExc_IndexError, "string index out of range");
1574 return -1;
1575 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001576 if (how_many < 0) {
1577 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1578 return -1;
1579 }
1580 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001581 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1582 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001583 "Cannot write %zi characters at %zi "
1584 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001585 how_many, to_start, PyUnicode_GET_LENGTH(to));
1586 return -1;
1587 }
1588
1589 if (how_many == 0)
1590 return 0;
1591
Victor Stinner488fa492011-12-12 00:01:39 +01001592 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001593 return -1;
1594
1595 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1596 if (err) {
1597 PyErr_Format(PyExc_SystemError,
1598 "Cannot copy %s characters "
1599 "into a string of %s characters",
1600 unicode_kind_name(from),
1601 unicode_kind_name(to));
1602 return -1;
1603 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001604 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605}
1606
Victor Stinner17222162011-09-28 22:15:37 +02001607/* Find the maximum code point and count the number of surrogate pairs so a
1608 correct string length can be computed before converting a string to UCS4.
1609 This function counts single surrogates as a character and not as a pair.
1610
1611 Return 0 on success, or -1 on error. */
1612static int
1613find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1614 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615{
1616 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001617 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618
Victor Stinnerc53be962011-10-02 21:33:54 +02001619 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 *num_surrogates = 0;
1621 *maxchar = 0;
1622
1623 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001624#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001625 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1626 && (iter+1) < end
1627 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1628 {
1629 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1630 ++(*num_surrogates);
1631 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 }
1633 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001635 {
1636 ch = *iter;
1637 iter++;
1638 }
1639 if (ch > *maxchar) {
1640 *maxchar = ch;
1641 if (*maxchar > MAX_UNICODE) {
1642 PyErr_Format(PyExc_ValueError,
1643 "character U+%x is not in range [U+0000; U+10ffff]",
1644 ch);
1645 return -1;
1646 }
1647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 }
1649 return 0;
1650}
1651
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001652int
1653_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654{
1655 wchar_t *end;
1656 Py_UCS4 maxchar = 0;
1657 Py_ssize_t num_surrogates;
1658#if SIZEOF_WCHAR_T == 2
1659 Py_ssize_t length_wo_surrogates;
1660#endif
1661
Georg Brandl7597add2011-10-05 16:36:47 +02001662 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001663 strings were created using _PyObject_New() and where no canonical
1664 representation (the str field) has been set yet aka strings
1665 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001666 assert(_PyUnicode_CHECK(unicode));
1667 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001669 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001670 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001671 /* Actually, it should neither be interned nor be anything else: */
1672 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001675 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001676 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678
1679 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001680 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1681 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 PyErr_NoMemory();
1683 return -1;
1684 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001685 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 _PyUnicode_WSTR(unicode), end,
1687 PyUnicode_1BYTE_DATA(unicode));
1688 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1689 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1690 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1691 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001692 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001694 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 }
1696 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001697 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001698 _PyUnicode_UTF8(unicode) = NULL;
1699 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 }
1701 PyObject_FREE(_PyUnicode_WSTR(unicode));
1702 _PyUnicode_WSTR(unicode) = NULL;
1703 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1704 }
1705 /* In this case we might have to convert down from 4-byte native
1706 wchar_t to 2-byte unicode. */
1707 else if (maxchar < 65536) {
1708 assert(num_surrogates == 0 &&
1709 "FindMaxCharAndNumSurrogatePairs() messed up");
1710
Victor Stinner506f5922011-09-28 22:34:18 +02001711#if SIZEOF_WCHAR_T == 2
1712 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001713 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001714 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001717 _PyUnicode_UTF8(unicode) = NULL;
1718 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001719#else
1720 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001721 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001722 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001723 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001724 PyErr_NoMemory();
1725 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 }
Victor Stinner506f5922011-09-28 22:34:18 +02001727 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1728 _PyUnicode_WSTR(unicode), end,
1729 PyUnicode_2BYTE_DATA(unicode));
1730 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1731 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1732 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001733 _PyUnicode_UTF8(unicode) = NULL;
1734 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001735 PyObject_FREE(_PyUnicode_WSTR(unicode));
1736 _PyUnicode_WSTR(unicode) = NULL;
1737 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1738#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 }
1740 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1741 else {
1742#if SIZEOF_WCHAR_T == 2
1743 /* in case the native representation is 2-bytes, we need to allocate a
1744 new normalized 4-byte version. */
1745 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001746 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1747 PyErr_NoMemory();
1748 return -1;
1749 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001750 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1751 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 PyErr_NoMemory();
1753 return -1;
1754 }
1755 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1756 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8(unicode) = NULL;
1758 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001759 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1760 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001761 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 PyObject_FREE(_PyUnicode_WSTR(unicode));
1763 _PyUnicode_WSTR(unicode) = NULL;
1764 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1765#else
1766 assert(num_surrogates == 0);
1767
Victor Stinnerc3c74152011-10-02 20:39:55 +02001768 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001770 _PyUnicode_UTF8(unicode) = NULL;
1771 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1773#endif
1774 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1775 }
1776 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001777 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 return 0;
1779}
1780
Alexander Belopolsky40018472011-02-26 01:02:56 +00001781static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001782unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783{
Walter Dörwald16807132007-05-25 13:52:07 +00001784 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 case SSTATE_NOT_INTERNED:
1786 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001787
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 case SSTATE_INTERNED_MORTAL:
1789 /* revive dead object temporarily for DelItem */
1790 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001791 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001792 Py_FatalError(
1793 "deletion of interned string failed");
1794 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001795
Benjamin Peterson29060642009-01-31 22:14:21 +00001796 case SSTATE_INTERNED_IMMORTAL:
1797 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001798 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001799
Benjamin Peterson29060642009-01-31 22:14:21 +00001800 default:
1801 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001802 }
1803
Victor Stinner03490912011-10-03 23:45:12 +02001804 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001806 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001807 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001808 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1809 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001811 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812}
1813
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character strings of unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a string of exactly one character that already has a canonical
       kind can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1830
Alexander Belopolsky40018472011-02-26 01:02:56 +00001831static int
Victor Stinner488fa492011-12-12 00:01:39 +01001832unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001833{
Victor Stinner488fa492011-12-12 00:01:39 +01001834 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001835 if (Py_REFCNT(unicode) != 1)
1836 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001837 if (_PyUnicode_HASH(unicode) != -1)
1838 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001839 if (PyUnicode_CHECK_INTERNED(unicode))
1840 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001841 if (!PyUnicode_CheckExact(unicode))
1842 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001843#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001844 /* singleton refcount is greater than 1 */
1845 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001846#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001847 return 1;
1848}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001849
Victor Stinnerfe226c02011-10-03 03:52:20 +02001850static int
1851unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1852{
1853 PyObject *unicode;
1854 Py_ssize_t old_length;
1855
1856 assert(p_unicode != NULL);
1857 unicode = *p_unicode;
1858
1859 assert(unicode != NULL);
1860 assert(PyUnicode_Check(unicode));
1861 assert(0 <= length);
1862
Victor Stinner910337b2011-10-03 03:20:16 +02001863 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001864 old_length = PyUnicode_WSTR_LENGTH(unicode);
1865 else
1866 old_length = PyUnicode_GET_LENGTH(unicode);
1867 if (old_length == length)
1868 return 0;
1869
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001870 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001871 _Py_INCREF_UNICODE_EMPTY();
1872 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001873 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001874 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001875 return 0;
1876 }
1877
Victor Stinner488fa492011-12-12 00:01:39 +01001878 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001879 PyObject *copy = resize_copy(unicode, length);
1880 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001882 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001883 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001884 }
1885
Victor Stinnerfe226c02011-10-03 03:52:20 +02001886 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001887 PyObject *new_unicode = resize_compact(unicode, length);
1888 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001890 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001891 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001892 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001893 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894}
1895
Alexander Belopolsky40018472011-02-26 01:02:56 +00001896int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001898{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001899 PyObject *unicode;
1900 if (p_unicode == NULL) {
1901 PyErr_BadInternalCall();
1902 return -1;
1903 }
1904 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001905 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001906 {
1907 PyErr_BadInternalCall();
1908 return -1;
1909 }
1910 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001911}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001912
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001913/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001914
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001915 WARNING: The function doesn't copy the terminating null character and
1916 doesn't check the maximum character (may write a latin1 character in an
1917 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001918static void
1919unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1920 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001921{
1922 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1923 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001924 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001925
1926 switch (kind) {
1927 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001929#ifdef Py_DEBUG
1930 if (PyUnicode_IS_ASCII(unicode)) {
1931 Py_UCS4 maxchar = ucs1lib_find_max_char(
1932 (const Py_UCS1*)str,
1933 (const Py_UCS1*)str + len);
1934 assert(maxchar < 128);
1935 }
1936#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001937 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001938 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 case PyUnicode_2BYTE_KIND: {
1941 Py_UCS2 *start = (Py_UCS2 *)data + index;
1942 Py_UCS2 *ucs2 = start;
1943 assert(index <= PyUnicode_GET_LENGTH(unicode));
1944
Victor Stinner184252a2012-06-16 02:57:41 +02001945 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001946 *ucs2 = (Py_UCS2)*str;
1947
1948 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001949 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001950 }
1951 default: {
1952 Py_UCS4 *start = (Py_UCS4 *)data + index;
1953 Py_UCS4 *ucs4 = start;
1954 assert(kind == PyUnicode_4BYTE_KIND);
1955 assert(index <= PyUnicode_GET_LENGTH(unicode));
1956
Victor Stinner184252a2012-06-16 02:57:41 +02001957 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001958 *ucs4 = (Py_UCS4)*str;
1959
1960 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001961 }
1962 }
1963}
1964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965static PyObject*
1966get_latin1_char(unsigned char ch)
1967{
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001970 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971 if (!unicode)
1972 return NULL;
1973 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001974 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 unicode_latin1[ch] = unicode;
1976 }
1977 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001978 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979}
1980
Victor Stinner985a82a2014-01-03 12:53:47 +01001981static PyObject*
1982unicode_char(Py_UCS4 ch)
1983{
1984 PyObject *unicode;
1985
1986 assert(ch <= MAX_UNICODE);
1987
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001988 if (ch < 256)
1989 return get_latin1_char(ch);
1990
Victor Stinner985a82a2014-01-03 12:53:47 +01001991 unicode = PyUnicode_New(1, ch);
1992 if (unicode == NULL)
1993 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001994
1995 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1996 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001998 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001999 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2000 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2001 }
2002 assert(_PyUnicode_CheckConsistency(unicode, 1));
2003 return unicode;
2004}
2005
Alexander Belopolsky40018472011-02-26 01:02:56 +00002006PyObject *
2007PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002009 if (u == NULL)
2010 return (PyObject*)_PyUnicode_New(size);
2011
2012 if (size < 0) {
2013 PyErr_BadInternalCall();
2014 return NULL;
2015 }
2016
2017 return PyUnicode_FromWideChar(u, size);
2018}
2019
2020PyObject *
2021PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2022{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002023 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 Py_UCS4 maxchar = 0;
2025 Py_ssize_t num_surrogates;
2026
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002027 if (u == NULL && size != 0) {
2028 PyErr_BadInternalCall();
2029 return NULL;
2030 }
2031
2032 if (size == -1) {
2033 size = wcslen(u);
2034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036 /* If the Unicode data is known at construction time, we can apply
2037 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002040 if (size == 0)
2041 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 /* Single character Unicode objects in the Latin-1 range are
2044 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002045 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 return get_latin1_char((unsigned char)*u);
2047
2048 /* If not empty and not single character, copy the Unicode data
2049 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002050 if (find_maxchar_surrogates(u, u + size,
2051 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 return NULL;
2053
Victor Stinner8faf8212011-12-08 22:14:11 +01002054 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 if (!unicode)
2056 return NULL;
2057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 switch (PyUnicode_KIND(unicode)) {
2059 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002060 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2062 break;
2063 case PyUnicode_2BYTE_KIND:
2064#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002065 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002067 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2069#endif
2070 break;
2071 case PyUnicode_4BYTE_KIND:
2072#if SIZEOF_WCHAR_T == 2
2073 /* This is the only case which has to process surrogates, thus
2074 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002075 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076#else
2077 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002078 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079#endif
2080 break;
2081 default:
2082 assert(0 && "Impossible state");
2083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002085 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086}
2087
Alexander Belopolsky40018472011-02-26 01:02:56 +00002088PyObject *
2089PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002090{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002091 if (size < 0) {
2092 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002093 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002094 return NULL;
2095 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002096 if (u != NULL)
2097 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2098 else
2099 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002100}
2101
Alexander Belopolsky40018472011-02-26 01:02:56 +00002102PyObject *
2103PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002104{
2105 size_t size = strlen(u);
2106 if (size > PY_SSIZE_T_MAX) {
2107 PyErr_SetString(PyExc_OverflowError, "input too long");
2108 return NULL;
2109 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002110 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002111}
2112
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002113PyObject *
2114_PyUnicode_FromId(_Py_Identifier *id)
2115{
2116 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002117 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2118 strlen(id->string),
2119 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002120 if (!id->object)
2121 return NULL;
2122 PyUnicode_InternInPlace(&id->object);
2123 assert(!id->next);
2124 id->next = static_strings;
2125 static_strings = id;
2126 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002127 return id->object;
2128}
2129
2130void
2131_PyUnicode_ClearStaticStrings()
2132{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002133 _Py_Identifier *tmp, *s = static_strings;
2134 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002135 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002136 tmp = s->next;
2137 s->next = NULL;
2138 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002140 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002141}
2142
Benjamin Peterson0df54292012-03-26 14:50:32 -04002143/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002144
Victor Stinnerd3f08822012-05-29 12:57:52 +02002145PyObject*
2146_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002147{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002148 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002149 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002150 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002152 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002153#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002154 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002155 }
Victor Stinner785938e2011-12-11 20:09:03 +01002156 unicode = PyUnicode_New(size, 127);
2157 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002158 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002159 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2160 assert(_PyUnicode_CheckConsistency(unicode, 1));
2161 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002162}
2163
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002164static Py_UCS4
2165kind_maxchar_limit(unsigned int kind)
2166{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002167 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002168 case PyUnicode_1BYTE_KIND:
2169 return 0x80;
2170 case PyUnicode_2BYTE_KIND:
2171 return 0x100;
2172 case PyUnicode_4BYTE_KIND:
2173 return 0x10000;
2174 default:
2175 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002176 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002177 }
2178}
2179
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002180static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002181align_maxchar(Py_UCS4 maxchar)
2182{
2183 if (maxchar <= 127)
2184 return 127;
2185 else if (maxchar <= 255)
2186 return 255;
2187 else if (maxchar <= 65535)
2188 return 65535;
2189 else
2190 return MAX_UNICODE;
2191}
2192
Victor Stinner702c7342011-10-05 13:50:52 +02002193static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002194_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002197 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002198
Serhiy Storchaka678db842013-01-26 12:16:36 +02002199 if (size == 0)
2200 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002201 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002202 if (size == 1)
2203 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002204
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002205 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002206 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 if (!res)
2208 return NULL;
2209 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002210 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002212}
2213
Victor Stinnere57b1c02011-09-28 22:20:48 +02002214static PyObject*
2215_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216{
2217 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002218 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002219
Serhiy Storchaka678db842013-01-26 12:16:36 +02002220 if (size == 0)
2221 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002222 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002223 if (size == 1)
2224 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002225
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002226 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002227 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 if (!res)
2229 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002230 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002232 else {
2233 _PyUnicode_CONVERT_BYTES(
2234 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2235 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002236 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 return res;
2238}
2239
Victor Stinnere57b1c02011-09-28 22:20:48 +02002240static PyObject*
2241_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242{
2243 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002244 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245
Serhiy Storchaka678db842013-01-26 12:16:36 +02002246 if (size == 0)
2247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002249 if (size == 1)
2250 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002252 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002253 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!res)
2255 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 if (max_char < 256)
2257 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2258 PyUnicode_1BYTE_DATA(res));
2259 else if (max_char < 0x10000)
2260 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2261 PyUnicode_2BYTE_DATA(res));
2262 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002264 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 return res;
2266}
2267
2268PyObject*
2269PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2270{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002271 if (size < 0) {
2272 PyErr_SetString(PyExc_ValueError, "size must be positive");
2273 return NULL;
2274 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002275 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002277 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002279 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002281 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002282 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002283 PyErr_SetString(PyExc_SystemError, "invalid kind");
2284 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286}
2287
Victor Stinnerece58de2012-04-23 23:36:38 +02002288Py_UCS4
2289_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2290{
2291 enum PyUnicode_Kind kind;
2292 void *startptr, *endptr;
2293
2294 assert(PyUnicode_IS_READY(unicode));
2295 assert(0 <= start);
2296 assert(end <= PyUnicode_GET_LENGTH(unicode));
2297 assert(start <= end);
2298
2299 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2300 return PyUnicode_MAX_CHAR_VALUE(unicode);
2301
2302 if (start == end)
2303 return 127;
2304
Victor Stinner94d558b2012-04-27 22:26:58 +02002305 if (PyUnicode_IS_ASCII(unicode))
2306 return 127;
2307
Victor Stinnerece58de2012-04-23 23:36:38 +02002308 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002309 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002310 endptr = (char *)startptr + end * kind;
2311 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002312 switch(kind) {
2313 case PyUnicode_1BYTE_KIND:
2314 return ucs1lib_find_max_char(startptr, endptr);
2315 case PyUnicode_2BYTE_KIND:
2316 return ucs2lib_find_max_char(startptr, endptr);
2317 case PyUnicode_4BYTE_KIND:
2318 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002319 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002320 assert(0);
2321 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002322 }
2323}
2324
Victor Stinner25a4b292011-10-06 12:31:55 +02002325/* Ensure that a string uses the most efficient storage, if it is not the
2326 case: create a new string with of the right kind. Write NULL into *p_unicode
2327 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002328static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002329unicode_adjust_maxchar(PyObject **p_unicode)
2330{
2331 PyObject *unicode, *copy;
2332 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002333 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002334 unsigned int kind;
2335
2336 assert(p_unicode != NULL);
2337 unicode = *p_unicode;
2338 assert(PyUnicode_IS_READY(unicode));
2339 if (PyUnicode_IS_ASCII(unicode))
2340 return;
2341
2342 len = PyUnicode_GET_LENGTH(unicode);
2343 kind = PyUnicode_KIND(unicode);
2344 if (kind == PyUnicode_1BYTE_KIND) {
2345 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002346 max_char = ucs1lib_find_max_char(u, u + len);
2347 if (max_char >= 128)
2348 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002349 }
2350 else if (kind == PyUnicode_2BYTE_KIND) {
2351 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002352 max_char = ucs2lib_find_max_char(u, u + len);
2353 if (max_char >= 256)
2354 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002355 }
2356 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002359 max_char = ucs4lib_find_max_char(u, u + len);
2360 if (max_char >= 0x10000)
2361 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002362 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002363 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002364 if (copy != NULL)
2365 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002366 Py_DECREF(unicode);
2367 *p_unicode = copy;
2368}
2369
Victor Stinner034f6cf2011-09-30 02:26:44 +02002370PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002371_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002372{
Victor Stinner87af4f22011-11-21 23:03:47 +01002373 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002374 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002375
Victor Stinner034f6cf2011-09-30 02:26:44 +02002376 if (!PyUnicode_Check(unicode)) {
2377 PyErr_BadInternalCall();
2378 return NULL;
2379 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002380 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002381 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002382
Victor Stinner87af4f22011-11-21 23:03:47 +01002383 length = PyUnicode_GET_LENGTH(unicode);
2384 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002385 if (!copy)
2386 return NULL;
2387 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2388
Christian Heimesf051e432016-09-13 20:22:02 +02002389 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002390 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002391 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002392 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002393}
2394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395
Victor Stinnerbc603d12011-10-02 01:00:40 +02002396/* Widen Unicode objects to larger buffers. Don't write terminating null
2397 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398
2399void*
2400_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2401{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002402 Py_ssize_t len;
2403 void *result;
2404 unsigned int skind;
2405
Benjamin Petersonbac79492012-01-14 13:34:47 -05002406 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002407 return NULL;
2408
2409 len = PyUnicode_GET_LENGTH(s);
2410 skind = PyUnicode_KIND(s);
2411 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002412 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return NULL;
2414 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002415 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002417 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002418 if (!result)
2419 return PyErr_NoMemory();
2420 assert(skind == PyUnicode_1BYTE_KIND);
2421 _PyUnicode_CONVERT_BYTES(
2422 Py_UCS1, Py_UCS2,
2423 PyUnicode_1BYTE_DATA(s),
2424 PyUnicode_1BYTE_DATA(s) + len,
2425 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002428 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002429 if (!result)
2430 return PyErr_NoMemory();
2431 if (skind == PyUnicode_2BYTE_KIND) {
2432 _PyUnicode_CONVERT_BYTES(
2433 Py_UCS2, Py_UCS4,
2434 PyUnicode_2BYTE_DATA(s),
2435 PyUnicode_2BYTE_DATA(s) + len,
2436 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002438 else {
2439 assert(skind == PyUnicode_1BYTE_KIND);
2440 _PyUnicode_CONVERT_BYTES(
2441 Py_UCS1, Py_UCS4,
2442 PyUnicode_1BYTE_DATA(s),
2443 PyUnicode_1BYTE_DATA(s) + len,
2444 result);
2445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002447 default:
2448 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 }
Victor Stinner01698042011-10-04 00:04:26 +02002450 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 return NULL;
2452}
2453
2454static Py_UCS4*
2455as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2456 int copy_null)
2457{
2458 int kind;
2459 void *data;
2460 Py_ssize_t len, targetlen;
2461 if (PyUnicode_READY(string) == -1)
2462 return NULL;
2463 kind = PyUnicode_KIND(string);
2464 data = PyUnicode_DATA(string);
2465 len = PyUnicode_GET_LENGTH(string);
2466 targetlen = len;
2467 if (copy_null)
2468 targetlen++;
2469 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002470 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 if (!target) {
2472 PyErr_NoMemory();
2473 return NULL;
2474 }
2475 }
2476 else {
2477 if (targetsize < targetlen) {
2478 PyErr_Format(PyExc_SystemError,
2479 "string is longer than the buffer");
2480 if (copy_null && 0 < targetsize)
2481 target[0] = 0;
2482 return NULL;
2483 }
2484 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002485 if (kind == PyUnicode_1BYTE_KIND) {
2486 Py_UCS1 *start = (Py_UCS1 *) data;
2487 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002489 else if (kind == PyUnicode_2BYTE_KIND) {
2490 Py_UCS2 *start = (Py_UCS2 *) data;
2491 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2492 }
2493 else {
2494 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002495 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 if (copy_null)
2498 target[len] = 0;
2499 return target;
2500}
2501
2502Py_UCS4*
2503PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2504 int copy_null)
2505{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002506 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 PyErr_BadInternalCall();
2508 return NULL;
2509 }
2510 return as_ucs4(string, target, targetsize, copy_null);
2511}
2512
2513Py_UCS4*
2514PyUnicode_AsUCS4Copy(PyObject *string)
2515{
2516 return as_ucs4(string, NULL, 0, 1);
2517}
2518
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In the expression below, (SIZEOF_LONG_LONG*53-1) / 22 + 1 is the
   integer form of ceil(SIZEOF_LONG_LONG*53/22); the remaining 1 is
   the sign.  The value is used as the size of the char buffers that
   sprintf() fills in unicode_fromformat_arg(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002523
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002524static int
2525unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2526 Py_ssize_t width, Py_ssize_t precision)
2527{
2528 Py_ssize_t length, fill, arglen;
2529 Py_UCS4 maxchar;
2530
2531 if (PyUnicode_READY(str) == -1)
2532 return -1;
2533
2534 length = PyUnicode_GET_LENGTH(str);
2535 if ((precision == -1 || precision >= length)
2536 && width <= length)
2537 return _PyUnicodeWriter_WriteStr(writer, str);
2538
2539 if (precision != -1)
2540 length = Py_MIN(precision, length);
2541
2542 arglen = Py_MAX(length, width);
2543 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2544 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2545 else
2546 maxchar = writer->maxchar;
2547
2548 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2549 return -1;
2550
2551 if (width > length) {
2552 fill = width - length;
2553 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2554 return -1;
2555 writer->pos += fill;
2556 }
2557
2558 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2559 str, 0, length);
2560 writer->pos += length;
2561 return 0;
2562}
2563
2564static int
2565unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2566 Py_ssize_t width, Py_ssize_t precision)
2567{
2568 /* UTF-8 */
2569 Py_ssize_t length;
2570 PyObject *unicode;
2571 int res;
2572
2573 length = strlen(str);
2574 if (precision != -1)
2575 length = Py_MIN(length, precision);
2576 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2577 if (unicode == NULL)
2578 return -1;
2579
2580 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2581 Py_DECREF(unicode);
2582 return res;
2583}
2584
Victor Stinner96865452011-03-01 23:44:09 +00002585static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002586unicode_fromformat_arg(_PyUnicodeWriter *writer,
2587 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002588{
Victor Stinnere215d962012-10-06 23:03:36 +02002589 const char *p;
2590 Py_ssize_t len;
2591 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002592 Py_ssize_t width;
2593 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 int longflag;
2595 int longlongflag;
2596 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002597 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002598
2599 p = f;
2600 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002601 zeropad = 0;
2602 if (*f == '0') {
2603 zeropad = 1;
2604 f++;
2605 }
Victor Stinner96865452011-03-01 23:44:09 +00002606
2607 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002608 width = -1;
2609 if (Py_ISDIGIT((unsigned)*f)) {
2610 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002611 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002612 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002614 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002615 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002616 return NULL;
2617 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002619 f++;
2620 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002621 }
2622 precision = -1;
2623 if (*f == '.') {
2624 f++;
2625 if (Py_ISDIGIT((unsigned)*f)) {
2626 precision = (*f - '0');
2627 f++;
2628 while (Py_ISDIGIT((unsigned)*f)) {
2629 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2630 PyErr_SetString(PyExc_ValueError,
2631 "precision too big");
2632 return NULL;
2633 }
2634 precision = (precision * 10) + (*f - '0');
2635 f++;
2636 }
2637 }
Victor Stinner96865452011-03-01 23:44:09 +00002638 if (*f == '%') {
2639 /* "%.3%s" => f points to "3" */
2640 f--;
2641 }
2642 }
2643 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002644 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002645 f--;
2646 }
Victor Stinner96865452011-03-01 23:44:09 +00002647
2648 /* Handle %ld, %lu, %lld and %llu. */
2649 longflag = 0;
2650 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002651 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002652 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002653 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002654 longflag = 1;
2655 ++f;
2656 }
Victor Stinner96865452011-03-01 23:44:09 +00002657 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002658 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002659 longlongflag = 1;
2660 f += 2;
2661 }
Victor Stinner96865452011-03-01 23:44:09 +00002662 }
2663 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002664 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002665 size_tflag = 1;
2666 ++f;
2667 }
Victor Stinnere215d962012-10-06 23:03:36 +02002668
2669 if (f[1] == '\0')
2670 writer->overallocate = 0;
2671
2672 switch (*f) {
2673 case 'c':
2674 {
2675 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002676 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002677 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002678 "character argument not in range(0x110000)");
2679 return NULL;
2680 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002681 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002682 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002683 break;
2684 }
2685
2686 case 'i':
2687 case 'd':
2688 case 'u':
2689 case 'x':
2690 {
2691 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002692 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002693 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002697 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002698 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002699 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002700 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002701 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002702 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002703 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002704 va_arg(*vargs, size_t));
2705 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, unsigned int));
2708 }
2709 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002711 }
2712 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002713 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002714 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002715 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002716 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002717 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002718 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002719 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002720 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002721 va_arg(*vargs, Py_ssize_t));
2722 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002723 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_arg(*vargs, int));
2725 }
2726 assert(len >= 0);
2727
Victor Stinnere215d962012-10-06 23:03:36 +02002728 if (precision < len)
2729 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730
2731 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2733 return NULL;
2734
Victor Stinnere215d962012-10-06 23:03:36 +02002735 if (width > precision) {
2736 Py_UCS4 fillchar;
2737 fill = width - precision;
2738 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002739 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2740 return NULL;
2741 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002742 }
Victor Stinner15a11362012-10-06 23:48:20 +02002743 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002744 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002745 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2746 return NULL;
2747 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002748 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002749
Victor Stinner4a587072013-11-19 12:54:53 +01002750 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2751 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002752 break;
2753 }
2754
2755 case 'p':
2756 {
2757 char number[MAX_LONG_LONG_CHARS];
2758
2759 len = sprintf(number, "%p", va_arg(*vargs, void*));
2760 assert(len >= 0);
2761
2762 /* %p is ill-defined: ensure leading 0x. */
2763 if (number[1] == 'X')
2764 number[1] = 'x';
2765 else if (number[1] != 'x') {
2766 memmove(number + 2, number,
2767 strlen(number) + 1);
2768 number[0] = '0';
2769 number[1] = 'x';
2770 len += 2;
2771 }
2772
Victor Stinner4a587072013-11-19 12:54:53 +01002773 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002774 return NULL;
2775 break;
2776 }
2777
2778 case 's':
2779 {
2780 /* UTF-8 */
2781 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002782 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002783 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002784 break;
2785 }
2786
2787 case 'U':
2788 {
2789 PyObject *obj = va_arg(*vargs, PyObject *);
2790 assert(obj && _PyUnicode_CHECK(obj));
2791
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002792 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002793 return NULL;
2794 break;
2795 }
2796
2797 case 'V':
2798 {
2799 PyObject *obj = va_arg(*vargs, PyObject *);
2800 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002801 if (obj) {
2802 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002804 return NULL;
2805 }
2806 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 assert(str != NULL);
2808 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002809 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002810 }
2811 break;
2812 }
2813
2814 case 'S':
2815 {
2816 PyObject *obj = va_arg(*vargs, PyObject *);
2817 PyObject *str;
2818 assert(obj);
2819 str = PyObject_Str(obj);
2820 if (!str)
2821 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002822 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002823 Py_DECREF(str);
2824 return NULL;
2825 }
2826 Py_DECREF(str);
2827 break;
2828 }
2829
2830 case 'R':
2831 {
2832 PyObject *obj = va_arg(*vargs, PyObject *);
2833 PyObject *repr;
2834 assert(obj);
2835 repr = PyObject_Repr(obj);
2836 if (!repr)
2837 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002838 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002839 Py_DECREF(repr);
2840 return NULL;
2841 }
2842 Py_DECREF(repr);
2843 break;
2844 }
2845
2846 case 'A':
2847 {
2848 PyObject *obj = va_arg(*vargs, PyObject *);
2849 PyObject *ascii;
2850 assert(obj);
2851 ascii = PyObject_ASCII(obj);
2852 if (!ascii)
2853 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002854 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002855 Py_DECREF(ascii);
2856 return NULL;
2857 }
2858 Py_DECREF(ascii);
2859 break;
2860 }
2861
2862 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002863 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002864 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002865 break;
2866
2867 default:
2868 /* if we stumble upon an unknown formatting code, copy the rest
2869 of the format string to the output string. (we cannot just
2870 skip the code, since there's no way to know what's in the
2871 argument list) */
2872 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002873 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002874 return NULL;
2875 f = p+len;
2876 return f;
2877 }
2878
2879 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002880 return f;
2881}
2882
Walter Dörwaldd2034312007-05-18 16:29:38 +00002883PyObject *
2884PyUnicode_FromFormatV(const char *format, va_list vargs)
2885{
Victor Stinnere215d962012-10-06 23:03:36 +02002886 va_list vargs2;
2887 const char *f;
2888 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002889
Victor Stinner8f674cc2013-04-17 23:02:17 +02002890 _PyUnicodeWriter_Init(&writer);
2891 writer.min_length = strlen(format) + 100;
2892 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002893
Benjamin Peterson0c212142016-09-20 20:39:33 -07002894 // Copy varags to be able to pass a reference to a subfunction.
2895 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002896
2897 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002898 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002899 f = unicode_fromformat_arg(&writer, f, &vargs2);
2900 if (f == NULL)
2901 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002904 const char *p;
2905 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002906
Victor Stinnere215d962012-10-06 23:03:36 +02002907 p = f;
2908 do
2909 {
2910 if ((unsigned char)*p > 127) {
2911 PyErr_Format(PyExc_ValueError,
2912 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2913 "string, got a non-ASCII byte: 0x%02x",
2914 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002915 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002916 }
2917 p++;
2918 }
2919 while (*p != '\0' && *p != '%');
2920 len = p - f;
2921
2922 if (*p == '\0')
2923 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002924
2925 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002926 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002927
2928 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002929 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002931 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002932 return _PyUnicodeWriter_Finish(&writer);
2933
2934 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002935 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002936 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002937 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002938}
2939
Walter Dörwaldd2034312007-05-18 16:29:38 +00002940PyObject *
2941PyUnicode_FromFormat(const char *format, ...)
2942{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 PyObject* ret;
2944 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002945
2946#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002948#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002949 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002950#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002951 ret = PyUnicode_FromFormatV(format, vargs);
2952 va_end(vargs);
2953 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954}
2955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956#ifdef HAVE_WCHAR_H
2957
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002958/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959
Victor Stinnerd88d9832011-09-06 02:00:05 +02002960 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 character) required to convert the unicode object. Ignore size argument.
2962
Victor Stinnerd88d9832011-09-06 02:00:05 +02002963 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002964 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002965 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002966Py_ssize_t
2967PyUnicode_AsWideChar(PyObject *unicode,
2968 wchar_t *w,
2969 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002970{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002971 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002972 const wchar_t *wstr;
2973
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002974 if (unicode == NULL) {
2975 PyErr_BadInternalCall();
2976 return -1;
2977 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002978 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002979 if (wstr == NULL)
2980 return -1;
2981
Victor Stinner5593d8a2010-10-02 11:11:27 +00002982 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002983 if (size > res)
2984 size = res + 1;
2985 else
2986 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002987 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002988 return res;
2989 }
2990 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002991 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002992}
2993
Victor Stinner137c34c2010-09-29 10:25:54 +00002994wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002995PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002996 Py_ssize_t *size)
2997{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002998 const wchar_t *wstr;
2999 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003000 Py_ssize_t buflen;
3001
3002 if (unicode == NULL) {
3003 PyErr_BadInternalCall();
3004 return NULL;
3005 }
3006
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003007 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3008 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003009 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003010 }
3011 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3012 PyErr_SetString(PyExc_ValueError,
3013 "embedded null character");
3014 return NULL;
3015 }
3016
3017 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003018 if (buffer == NULL) {
3019 PyErr_NoMemory();
3020 return NULL;
3021 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003022 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003023 if (size != NULL)
3024 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003025 return buffer;
3026}
3027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003028#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
Alexander Belopolsky40018472011-02-26 01:02:56 +00003030PyObject *
3031PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032{
Victor Stinner8faf8212011-12-08 22:14:11 +01003033 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 PyErr_SetString(PyExc_ValueError,
3035 "chr() arg not in range(0x110000)");
3036 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003037 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003038
Victor Stinner985a82a2014-01-03 12:53:47 +01003039 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003040}
3041
Alexander Belopolsky40018472011-02-26 01:02:56 +00003042PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003043PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003045 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003047 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003048 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003049 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 Py_INCREF(obj);
3051 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003052 }
3053 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003054 /* For a Unicode subtype that's not a Unicode object,
3055 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003056 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003057 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003058 PyErr_Format(PyExc_TypeError,
3059 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003060 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003061 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062}
3063
Alexander Belopolsky40018472011-02-26 01:02:56 +00003064PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003065PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003066 const char *encoding,
3067 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003070 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 PyErr_BadInternalCall();
3074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003076
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003077 /* Decoding bytes objects is the most common case and should be fast */
3078 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003079 if (PyBytes_GET_SIZE(obj) == 0)
3080 _Py_RETURN_UNICODE_EMPTY();
3081 v = PyUnicode_Decode(
3082 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3083 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003084 return v;
3085 }
3086
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003087 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 PyErr_SetString(PyExc_TypeError,
3089 "decoding str is not supported");
3090 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003091 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3094 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3095 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003096 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003097 Py_TYPE(obj)->tp_name);
3098 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003099 }
Tim Petersced69f82003-09-16 20:30:58 +00003100
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003101 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003102 PyBuffer_Release(&buffer);
3103 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003105
Serhiy Storchaka05997252013-01-26 12:14:02 +02003106 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003107 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003108 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109}
3110
/* Normalize an encoding name into the buffer `lower` (capacity lower_len
   bytes, trailing null byte included): similar to
   encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are kept (lowercased).  Any run of other
   characters found between two kept characters becomes a single '_'; such
   runs at the very beginning or end of the name are dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_last;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    /* Last slot of the buffer: reserved for the trailing null byte. */
    dst_last = &lower[lower_len - 1];
    pending_sep = 0;

    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' only if more name
               characters follow. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_last) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_last) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3159
Alexander Belopolsky40018472011-02-26 01:02:56 +00003160PyObject *
3161PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003162 Py_ssize_t size,
3163 const char *encoding,
3164 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003165{
3166 PyObject *buffer = NULL, *unicode;
3167 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003168 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3169
3170 if (encoding == NULL) {
3171 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3172 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003173
Fred Drakee4315f52000-05-09 19:53:39 +00003174 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003175 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3176 char *lower = buflower;
3177
3178 /* Fast paths */
3179 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3180 lower += 3;
3181 if (*lower == '_') {
3182 /* Match "utf8" and "utf_8" */
3183 lower++;
3184 }
3185
3186 if (lower[0] == '8' && lower[1] == 0) {
3187 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3188 }
3189 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3190 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3191 }
3192 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3193 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3194 }
3195 }
3196 else {
3197 if (strcmp(lower, "ascii") == 0
3198 || strcmp(lower, "us_ascii") == 0) {
3199 return PyUnicode_DecodeASCII(s, size, errors);
3200 }
Steve Dowercc16be82016-09-08 10:35:16 -07003201 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003202 else if (strcmp(lower, "mbcs") == 0) {
3203 return PyUnicode_DecodeMBCS(s, size, errors);
3204 }
3205 #endif
3206 else if (strcmp(lower, "latin1") == 0
3207 || strcmp(lower, "latin_1") == 0
3208 || strcmp(lower, "iso_8859_1") == 0
3209 || strcmp(lower, "iso8859_1") == 0) {
3210 return PyUnicode_DecodeLatin1(s, size, errors);
3211 }
3212 }
Victor Stinner37296e82010-06-10 13:36:23 +00003213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214
3215 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003216 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003217 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003218 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003219 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 if (buffer == NULL)
3221 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003222 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 if (unicode == NULL)
3224 goto onError;
3225 if (!PyUnicode_Check(unicode)) {
3226 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003227 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3228 "use codecs.decode() to decode to arbitrary types",
3229 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003230 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 Py_DECREF(unicode);
3232 goto onError;
3233 }
3234 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003235 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003236
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 Py_XDECREF(buffer);
3239 return NULL;
3240}
3241
Alexander Belopolsky40018472011-02-26 01:02:56 +00003242PyObject *
3243PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003244 const char *encoding,
3245 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003247 if (!PyUnicode_Check(unicode)) {
3248 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003249 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003250 }
3251
Serhiy Storchaka00939072016-10-27 21:05:49 +03003252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3253 "PyUnicode_AsDecodedObject() is deprecated; "
3254 "use PyCodec_Decode() to decode from str", 1) < 0)
3255 return NULL;
3256
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003257 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003259
3260 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003261 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003262}
3263
Alexander Belopolsky40018472011-02-26 01:02:56 +00003264PyObject *
3265PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003266 const char *encoding,
3267 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003268{
3269 PyObject *v;
3270
3271 if (!PyUnicode_Check(unicode)) {
3272 PyErr_BadArgument();
3273 goto onError;
3274 }
3275
Serhiy Storchaka00939072016-10-27 21:05:49 +03003276 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3277 "PyUnicode_AsDecodedUnicode() is deprecated; "
3278 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3279 return NULL;
3280
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003281 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003283
3284 /* Decode via the codec registry */
3285 v = PyCodec_Decode(unicode, encoding, errors);
3286 if (v == NULL)
3287 goto onError;
3288 if (!PyUnicode_Check(v)) {
3289 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003290 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3291 "use codecs.decode() to decode to arbitrary types",
3292 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003293 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003294 Py_DECREF(v);
3295 goto onError;
3296 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003297 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003298
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003300 return NULL;
3301}
3302
Alexander Belopolsky40018472011-02-26 01:02:56 +00003303PyObject *
3304PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003305 Py_ssize_t size,
3306 const char *encoding,
3307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308{
3309 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003310
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003311 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3315 Py_DECREF(unicode);
3316 return v;
3317}
3318
Alexander Belopolsky40018472011-02-26 01:02:56 +00003319PyObject *
3320PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003321 const char *encoding,
3322 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003323{
3324 PyObject *v;
3325
3326 if (!PyUnicode_Check(unicode)) {
3327 PyErr_BadArgument();
3328 goto onError;
3329 }
3330
Serhiy Storchaka00939072016-10-27 21:05:49 +03003331 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3332 "PyUnicode_AsEncodedObject() is deprecated; "
3333 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3334 "or PyCodec_Encode() for generic encoding", 1) < 0)
3335 return NULL;
3336
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003337 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003339
3340 /* Encode via the codec registry */
3341 v = PyCodec_Encode(unicode, encoding, errors);
3342 if (v == NULL)
3343 goto onError;
3344 return v;
3345
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003347 return NULL;
3348}
3349
/* Locate the first character of wstr that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Return the index, in
   wchar_t units, of the first character for which the conversion fails,
   or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + null */
#else
    wchar_t unit[2];            /* one character + null */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif

        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3396
Victor Stinner1b579672011-12-17 05:47:23 +01003397static int
3398locale_error_handler(const char *errors, int *surrogateescape)
3399{
Victor Stinner50149202015-09-22 00:26:54 +02003400 _Py_error_handler error_handler = get_error_handler(errors);
3401 switch (error_handler)
3402 {
3403 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003404 *surrogateescape = 0;
3405 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003406 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003407 *surrogateescape = 1;
3408 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003409 default:
3410 PyErr_Format(PyExc_ValueError,
3411 "only 'strict' and 'surrogateescape' error handlers "
3412 "are supported, not '%s'",
3413 errors);
3414 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003415 }
Victor Stinner1b579672011-12-17 05:47:23 +01003416}
3417
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003419PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003420{
3421 Py_ssize_t wlen, wlen2;
3422 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003424 PyObject *bytes, *reason, *exc;
3425 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003426 int surrogateescape;
3427
3428 if (locale_error_handler(errors, &surrogateescape) < 0)
3429 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003430
3431 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3432 if (wstr == NULL)
3433 return NULL;
3434
3435 wlen2 = wcslen(wstr);
3436 if (wlen2 != wlen) {
3437 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003438 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003439 return NULL;
3440 }
3441
3442 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003443 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003444 char *str;
3445
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003446 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003447 if (str == NULL) {
3448 if (error_pos == (size_t)-1) {
3449 PyErr_NoMemory();
3450 PyMem_Free(wstr);
3451 return NULL;
3452 }
3453 else {
3454 goto encode_error;
3455 }
3456 }
3457 PyMem_Free(wstr);
3458
3459 bytes = PyBytes_FromString(str);
3460 PyMem_Free(str);
3461 }
3462 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003463 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003464 size_t len, len2;
3465
3466 len = wcstombs(NULL, wstr, 0);
3467 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003468 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003469 goto encode_error;
3470 }
3471
3472 bytes = PyBytes_FromStringAndSize(NULL, len);
3473 if (bytes == NULL) {
3474 PyMem_Free(wstr);
3475 return NULL;
3476 }
3477
3478 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3479 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003480 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003481 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003482 goto encode_error;
3483 }
3484 PyMem_Free(wstr);
3485 }
3486 return bytes;
3487
3488encode_error:
3489 errmsg = strerror(errno);
3490 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003491
3492 if (error_pos == (size_t)-1)
3493 error_pos = wcstombs_errorpos(wstr);
3494
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003495 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003496
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003497 wstr = Py_DecodeLocale(errmsg, &errlen);
3498 if (wstr != NULL) {
3499 reason = PyUnicode_FromWideChar(wstr, errlen);
3500 PyMem_RawFree(wstr);
3501 } else {
3502 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003503 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003504
Victor Stinner2f197072011-12-17 07:08:30 +01003505 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003506 reason = PyUnicode_FromString(
3507 "wcstombs() encountered an unencodable "
3508 "wide character");
3509 if (reason == NULL)
3510 return NULL;
3511
3512 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3513 "locale", unicode,
3514 (Py_ssize_t)error_pos,
3515 (Py_ssize_t)(error_pos+1),
3516 reason);
3517 Py_DECREF(reason);
3518 if (exc != NULL) {
3519 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003520 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003521 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003522 return NULL;
3523}
3524
Victor Stinnerad158722010-10-27 00:25:46 +00003525PyObject *
3526PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003527{
Steve Dowercc16be82016-09-08 10:35:16 -07003528#if defined(__APPLE__)
3529 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003530#else
Victor Stinner793b5312011-04-27 00:24:21 +02003531 PyInterpreterState *interp = PyThreadState_GET()->interp;
3532 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3533 cannot use it to encode and decode filenames before it is loaded. Load
3534 the Python codec requires to encode at least its own filename. Use the C
3535 version of the locale codec until the codec registry is initialized and
3536 the Python codec is loaded.
3537
3538 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3539 cannot only rely on it: check also interp->fscodec_initialized for
3540 subinterpreters. */
3541 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003542 return PyUnicode_AsEncodedString(unicode,
3543 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003544 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003545 }
3546 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003547 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003548 }
Victor Stinnerad158722010-10-27 00:25:46 +00003549#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003550}
3551
Alexander Belopolsky40018472011-02-26 01:02:56 +00003552PyObject *
3553PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003554 const char *encoding,
3555 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556{
3557 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003558 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003559
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 if (!PyUnicode_Check(unicode)) {
3561 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 }
Fred Drakee4315f52000-05-09 19:53:39 +00003564
Victor Stinner942889a2016-09-05 15:40:10 -07003565 if (encoding == NULL) {
3566 return _PyUnicode_AsUTF8String(unicode, errors);
3567 }
3568
Fred Drakee4315f52000-05-09 19:53:39 +00003569 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003570 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3571 char *lower = buflower;
3572
3573 /* Fast paths */
3574 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3575 lower += 3;
3576 if (*lower == '_') {
3577 /* Match "utf8" and "utf_8" */
3578 lower++;
3579 }
3580
3581 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003582 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003583 }
3584 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3585 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3586 }
3587 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3588 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3589 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003590 }
Victor Stinner942889a2016-09-05 15:40:10 -07003591 else {
3592 if (strcmp(lower, "ascii") == 0
3593 || strcmp(lower, "us_ascii") == 0) {
3594 return _PyUnicode_AsASCIIString(unicode, errors);
3595 }
Steve Dowercc16be82016-09-08 10:35:16 -07003596#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003597 else if (strcmp(lower, "mbcs") == 0) {
3598 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3599 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003600#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003601 else if (strcmp(lower, "latin1") == 0 ||
3602 strcmp(lower, "latin_1") == 0 ||
3603 strcmp(lower, "iso_8859_1") == 0 ||
3604 strcmp(lower, "iso8859_1") == 0) {
3605 return _PyUnicode_AsLatin1String(unicode, errors);
3606 }
3607 }
Victor Stinner37296e82010-06-10 13:36:23 +00003608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609
3610 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003611 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003613 return NULL;
3614
3615 /* The normal path */
3616 if (PyBytes_Check(v))
3617 return v;
3618
3619 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003620 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003621 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003622 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003623
3624 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003625 "encoder %s returned bytearray instead of bytes; "
3626 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003627 encoding);
3628 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003629 Py_DECREF(v);
3630 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003631 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003632
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003633 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3634 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003635 Py_DECREF(v);
3636 return b;
3637 }
3638
3639 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003640 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3641 "use codecs.encode() to encode to arbitrary types",
3642 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003643 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003644 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003645 return NULL;
3646}
3647
Alexander Belopolsky40018472011-02-26 01:02:56 +00003648PyObject *
3649PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003650 const char *encoding,
3651 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003652{
3653 PyObject *v;
3654
3655 if (!PyUnicode_Check(unicode)) {
3656 PyErr_BadArgument();
3657 goto onError;
3658 }
3659
Serhiy Storchaka00939072016-10-27 21:05:49 +03003660 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3661 "PyUnicode_AsEncodedUnicode() is deprecated; "
3662 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3663 return NULL;
3664
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003665 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003667
3668 /* Encode via the codec registry */
3669 v = PyCodec_Encode(unicode, encoding, errors);
3670 if (v == NULL)
3671 goto onError;
3672 if (!PyUnicode_Check(v)) {
3673 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003674 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3675 "use codecs.encode() to encode to arbitrary types",
3676 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003677 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003678 Py_DECREF(v);
3679 goto onError;
3680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003682
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 return NULL;
3685}
3686
/* Locate the byte offset in str (of length len) at which mbrtowc() first
   reports a conversion error or an incomplete character.

   Returns 0 when no such position is found, when a null character is
   reached first, or when the build lacks mbrtowc() (HAVE_MBRTOWC
   undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3717
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003718PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003720 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003721{
3722 wchar_t smallbuf[256];
3723 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3724 wchar_t *wstr;
3725 size_t wlen, wlen2;
3726 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003727 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003728 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003729 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003730 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003731
3732 if (locale_error_handler(errors, &surrogateescape) < 0)
3733 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003734
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003735 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3736 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003737 return NULL;
3738 }
3739
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003740 if (surrogateescape) {
3741 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003742 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003743 if (wstr == NULL) {
3744 if (wlen == (size_t)-1)
3745 PyErr_NoMemory();
3746 else
3747 PyErr_SetFromErrno(PyExc_OSError);
3748 return NULL;
3749 }
3750
3751 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003752 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003753 }
3754 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003755 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003756#ifndef HAVE_BROKEN_MBSTOWCS
3757 wlen = mbstowcs(NULL, str, 0);
3758#else
3759 wlen = len;
3760#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003761 if (wlen == (size_t)-1)
3762 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003763 if (wlen+1 <= smallbuf_len) {
3764 wstr = smallbuf;
3765 }
3766 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003767 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003768 if (!wstr)
3769 return PyErr_NoMemory();
3770 }
3771
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003772 wlen2 = mbstowcs(wstr, str, wlen+1);
3773 if (wlen2 == (size_t)-1) {
3774 if (wstr != smallbuf)
3775 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003776 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003777 }
3778#ifdef HAVE_BROKEN_MBSTOWCS
3779 assert(wlen2 == wlen);
3780#endif
3781 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3782 if (wstr != smallbuf)
3783 PyMem_Free(wstr);
3784 }
3785 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003786
3787decode_error:
3788 errmsg = strerror(errno);
3789 assert(errmsg != NULL);
3790
3791 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003792 wstr = Py_DecodeLocale(errmsg, &errlen);
3793 if (wstr != NULL) {
3794 reason = PyUnicode_FromWideChar(wstr, errlen);
3795 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003796 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003797
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003798 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003799 reason = PyUnicode_FromString(
3800 "mbstowcs() encountered an invalid multibyte sequence");
3801 if (reason == NULL)
3802 return NULL;
3803
3804 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3805 "locale", str, len,
3806 (Py_ssize_t)error_pos,
3807 (Py_ssize_t)(error_pos+1),
3808 reason);
3809 Py_DECREF(reason);
3810 if (exc != NULL) {
3811 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003812 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003813 }
3814 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003815}
3816
3817PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003818PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003819{
3820 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003821 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003822}
3823
3824
3825PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003826PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003827 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003828 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3829}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003830
Christian Heimes5894ba72007-11-04 11:43:14 +00003831PyObject*
3832PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3833{
Steve Dowercc16be82016-09-08 10:35:16 -07003834#if defined(__APPLE__)
3835 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003836#else
Victor Stinner793b5312011-04-27 00:24:21 +02003837 PyInterpreterState *interp = PyThreadState_GET()->interp;
3838 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3839 cannot use it to encode and decode filenames before it is loaded. Load
3840 the Python codec requires to encode at least its own filename. Use the C
3841 version of the locale codec until the codec registry is initialized and
3842 the Python codec is loaded.
3843
3844 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3845 cannot only rely on it: check also interp->fscodec_initialized for
3846 subinterpreters. */
3847 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003848 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003850 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003851 }
3852 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003853 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003854 }
Victor Stinnerad158722010-10-27 00:25:46 +00003855#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003856}
3857
Martin v. Löwis011e8422009-05-05 04:43:17 +00003858
3859int
3860PyUnicode_FSConverter(PyObject* arg, void* addr)
3861{
Brett Cannonec6ce872016-09-06 15:50:29 -07003862 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003863 PyObject *output = NULL;
3864 Py_ssize_t size;
3865 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003866 if (arg == NULL) {
3867 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003868 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003869 return 1;
3870 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003871 path = PyOS_FSPath(arg);
3872 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003873 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003874 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003875 if (PyBytes_Check(path)) {
3876 output = path;
3877 }
3878 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3879 output = PyUnicode_EncodeFSDefault(path);
3880 Py_DECREF(path);
3881 if (!output) {
3882 return 0;
3883 }
3884 assert(PyBytes_Check(output));
3885 }
3886
Victor Stinner0ea2a462010-04-30 00:22:08 +00003887 size = PyBytes_GET_SIZE(output);
3888 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003889 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003890 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003891 Py_DECREF(output);
3892 return 0;
3893 }
3894 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003895 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003896}
3897
3898
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003899int
3900PyUnicode_FSDecoder(PyObject* arg, void* addr)
3901{
Brett Cannona5711202016-09-06 19:36:01 -07003902 int is_buffer = 0;
3903 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003904 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003905 if (arg == NULL) {
3906 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003907 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003908 return 1;
3909 }
Brett Cannona5711202016-09-06 19:36:01 -07003910
3911 is_buffer = PyObject_CheckBuffer(arg);
3912 if (!is_buffer) {
3913 path = PyOS_FSPath(arg);
3914 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003915 return 0;
3916 }
Brett Cannona5711202016-09-06 19:36:01 -07003917 }
3918 else {
3919 path = arg;
3920 Py_INCREF(arg);
3921 }
3922
3923 if (PyUnicode_Check(path)) {
3924 if (PyUnicode_READY(path) == -1) {
3925 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003926 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003927 }
3928 output = path;
3929 }
3930 else if (PyBytes_Check(path) || is_buffer) {
3931 PyObject *path_bytes = NULL;
3932
3933 if (!PyBytes_Check(path) &&
3934 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3935 "path should be string, bytes, or os.PathLike, not %.200s",
3936 Py_TYPE(arg)->tp_name)) {
3937 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003938 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003939 }
3940 path_bytes = PyBytes_FromObject(path);
3941 Py_DECREF(path);
3942 if (!path_bytes) {
3943 return 0;
3944 }
3945 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3946 PyBytes_GET_SIZE(path_bytes));
3947 Py_DECREF(path_bytes);
3948 if (!output) {
3949 return 0;
3950 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003951 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003952 else {
3953 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003954 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003955 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003956 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003957 return 0;
3958 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003959 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003960 Py_DECREF(output);
3961 return 0;
3962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003964 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003965 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003966 Py_DECREF(output);
3967 return 0;
3968 }
3969 *(PyObject**)addr = output;
3970 return Py_CLEANUP_SUPPORTED;
3971}
3972
3973
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003974const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003976{
Christian Heimesf3863112007-11-22 07:46:41 +00003977 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003979 if (!PyUnicode_Check(unicode)) {
3980 PyErr_BadArgument();
3981 return NULL;
3982 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003983 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003984 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003986 if (PyUnicode_UTF8(unicode) == NULL) {
3987 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003988 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 if (bytes == NULL)
3990 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003991 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3992 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003993 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994 Py_DECREF(bytes);
3995 return NULL;
3996 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003998 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 PyBytes_AS_STRING(bytes),
4000 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 Py_DECREF(bytes);
4002 }
4003
4004 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004005 *psize = PyUnicode_UTF8_LENGTH(unicode);
4006 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004007}
4008
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004009const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4013}
4014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015Py_UNICODE *
4016PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 const unsigned char *one_byte;
4019#if SIZEOF_WCHAR_T == 4
4020 const Py_UCS2 *two_bytes;
4021#else
4022 const Py_UCS4 *four_bytes;
4023 const Py_UCS4 *ucs4_end;
4024 Py_ssize_t num_surrogates;
4025#endif
4026 wchar_t *w;
4027 wchar_t *wchar_end;
4028
4029 if (!PyUnicode_Check(unicode)) {
4030 PyErr_BadArgument();
4031 return NULL;
4032 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004033 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004035 assert(_PyUnicode_KIND(unicode) != 0);
4036 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004040 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4041 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 num_surrogates = 0;
4043
4044 for (; four_bytes < ucs4_end; ++four_bytes) {
4045 if (*four_bytes > 0xFFFF)
4046 ++num_surrogates;
4047 }
4048
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004049 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4050 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4051 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 PyErr_NoMemory();
4053 return NULL;
4054 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004057 w = _PyUnicode_WSTR(unicode);
4058 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4059 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4061 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004062 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004064 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4065 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 }
4067 else
4068 *w = *four_bytes;
4069
4070 if (w > wchar_end) {
4071 assert(0 && "Miscalculated string end");
4072 }
4073 }
4074 *w = 0;
4075#else
4076 /* sizeof(wchar_t) == 4 */
4077 Py_FatalError("Impossible unicode object state, wstr and str "
4078 "should share memory already.");
4079 return NULL;
4080#endif
4081 }
4082 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004083 if ((size_t)_PyUnicode_LENGTH(unicode) >
4084 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4085 PyErr_NoMemory();
4086 return NULL;
4087 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004088 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4089 (_PyUnicode_LENGTH(unicode) + 1));
4090 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 PyErr_NoMemory();
4092 return NULL;
4093 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004094 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4095 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4096 w = _PyUnicode_WSTR(unicode);
4097 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004099 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4100 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101 for (; w < wchar_end; ++one_byte, ++w)
4102 *w = *one_byte;
4103 /* null-terminate the wstr */
4104 *w = 0;
4105 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004108 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 for (; w < wchar_end; ++two_bytes, ++w)
4110 *w = *two_bytes;
4111 /* null-terminate the wstr */
4112 *w = 0;
4113#else
4114 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004115 PyObject_FREE(_PyUnicode_WSTR(unicode));
4116 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004117 Py_FatalError("Impossible unicode object state, wstr "
4118 "and str should share memory already.");
4119 return NULL;
4120#endif
4121 }
4122 else {
4123 assert(0 && "This should never happen.");
4124 }
4125 }
4126 }
4127 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004128 *size = PyUnicode_WSTR_LENGTH(unicode);
4129 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004130}
4131
Alexander Belopolsky40018472011-02-26 01:02:56 +00004132Py_UNICODE *
4133PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136}
4137
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004138const Py_UNICODE *
4139_PyUnicode_AsUnicode(PyObject *unicode)
4140{
4141 Py_ssize_t size;
4142 const Py_UNICODE *wstr;
4143
4144 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4145 if (wstr && wcslen(wstr) != (size_t)size) {
4146 PyErr_SetString(PyExc_ValueError, "embedded null character");
4147 return NULL;
4148 }
4149 return wstr;
4150}
4151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152
Alexander Belopolsky40018472011-02-26 01:02:56 +00004153Py_ssize_t
4154PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155{
4156 if (!PyUnicode_Check(unicode)) {
4157 PyErr_BadArgument();
4158 goto onError;
4159 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004160 if (_PyUnicode_WSTR(unicode) == NULL) {
4161 if (PyUnicode_AsUnicode(unicode) == NULL)
4162 goto onError;
4163 }
4164 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 return -1;
4168}
4169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170Py_ssize_t
4171PyUnicode_GetLength(PyObject *unicode)
4172{
Victor Stinner07621332012-06-16 04:53:46 +02004173 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174 PyErr_BadArgument();
4175 return -1;
4176 }
Victor Stinner07621332012-06-16 04:53:46 +02004177 if (PyUnicode_READY(unicode) == -1)
4178 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 return PyUnicode_GET_LENGTH(unicode);
4180}
4181
4182Py_UCS4
4183PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4184{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004185 void *data;
4186 int kind;
4187
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004188 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4189 PyErr_BadArgument();
4190 return (Py_UCS4)-1;
4191 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004192 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004193 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 return (Py_UCS4)-1;
4195 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004196 data = PyUnicode_DATA(unicode);
4197 kind = PyUnicode_KIND(unicode);
4198 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004199}
4200
4201int
4202PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4203{
4204 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004205 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206 return -1;
4207 }
Victor Stinner488fa492011-12-12 00:01:39 +01004208 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004209 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004210 PyErr_SetString(PyExc_IndexError, "string index out of range");
4211 return -1;
4212 }
Victor Stinner488fa492011-12-12 00:01:39 +01004213 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004214 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004215 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4216 PyErr_SetString(PyExc_ValueError, "character out of range");
4217 return -1;
4218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4220 index, ch);
4221 return 0;
4222}
4223
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4229
Victor Stinner554f3f02010-06-16 23:33:54 +00004230/* create or adjust a UnicodeDecodeError */
4231static void
4232make_decode_exception(PyObject **exceptionObject,
4233 const char *encoding,
4234 const char *input, Py_ssize_t length,
4235 Py_ssize_t startpos, Py_ssize_t endpos,
4236 const char *reason)
4237{
4238 if (*exceptionObject == NULL) {
4239 *exceptionObject = PyUnicodeDecodeError_Create(
4240 encoding, input, length, startpos, endpos, reason);
4241 }
4242 else {
4243 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4244 goto onError;
4245 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4246 goto onError;
4247 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4248 goto onError;
4249 }
4250 return;
4251
4252onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004253 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004254}
4255
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors / errorHandler: name of the error handler and a cache for the
       looked-up callable (looked up on first use when *errorHandler is
       NULL).
   encoding / reason: used to build or update the UnicodeDecodeError.
   input / inend: start and end of the input bytes; both are rewritten
       from the exception's object after the callback, because the
       callback may have replaced it.
   startinpos / endinpos: bounds of the faulty input; *endinpos is set to
       the position returned by the callback.
   exceptionObject: cached exception, created on first use.
   inptr: set to the place in the (possibly new) input where decoding
       must continue.
   output / outpos: wchar_t-kind result string and the write position in
       it; the string is resized when needed.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* &argparse[3] (the text after "Un;") doubles as the TypeError
       message when the callback does not return a tuple. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode is borrowed from restuple, which stays alive until the
       end of this function. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position is relative to the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  wcsncpy() must not be used
       here: it stops at the first L'\0' and zero-fills the remainder,
       which would corrupt a replacement containing an embedded U+0000. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368
4369static int
4370unicode_decode_call_errorhandler_writer(
4371 const char *errors, PyObject **errorHandler,
4372 const char *encoding, const char *reason,
4373 const char **input, const char **inend, Py_ssize_t *startinpos,
4374 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4375 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4376{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004377 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004378
4379 PyObject *restuple = NULL;
4380 PyObject *repunicode = NULL;
4381 Py_ssize_t insize;
4382 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004383 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004384 PyObject *inputobj = NULL;
4385
4386 if (*errorHandler == NULL) {
4387 *errorHandler = PyCodec_LookupError(errors);
4388 if (*errorHandler == NULL)
4389 goto onError;
4390 }
4391
4392 make_decode_exception(exceptionObject,
4393 encoding,
4394 *input, *inend - *input,
4395 *startinpos, *endinpos,
4396 reason);
4397 if (*exceptionObject == NULL)
4398 goto onError;
4399
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004400 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401 if (restuple == NULL)
4402 goto onError;
4403 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004404 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004405 goto onError;
4406 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004407 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004408 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004409
4410 /* Copy back the bytes variables, which might have been modified by the
4411 callback */
4412 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4413 if (!inputobj)
4414 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004415 *input = PyBytes_AS_STRING(inputobj);
4416 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004417 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004418 /* we can DECREF safely, as the exception has another reference,
4419 so the object won't go away. */
4420 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004424 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004425 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004426 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004427 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428
Victor Stinner170ca6f2013-04-18 00:25:28 +02004429 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004430 if (replen > 1) {
4431 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004432 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004433 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4434 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4435 goto onError;
4436 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004437 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004438 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004439
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004441 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004442
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004444 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004449 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450}
4451
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452/* --- UTF-7 Codec -------------------------------------------------------- */
4453
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that each of them may
   evaluate its argument more than once, so arguments must be free of
   side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that any
 * expression (e.g. "a || b") can be passed safely.  NUL and everything
 * >= 128 is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532
Alexander Belopolsky40018472011-02-26 01:02:56 +00004533PyObject *
4534PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004535 Py_ssize_t size,
4536 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004538 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4539}
4540
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541/* The decoder. The only state we preserve is our read position,
4542 * i.e. how many characters we have consumed. So if we end in the
4543 * middle of a shift sequence we have to back off the read position
4544 * and the output to the beginning of the sequence, otherwise we lose
4545 * all the shift state (seen bits, number of bits seen, high
4546 * surrogate). */
4547
Alexander Belopolsky40018472011-02-26 01:02:56 +00004548PyObject *
4549PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004550 Py_ssize_t size,
4551 const char *errors,
4552 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004555 Py_ssize_t startinpos;
4556 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004558 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559 const char *errmsg = "";
4560 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004561 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562 unsigned int base64bits = 0;
4563 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004564 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 PyObject *errorHandler = NULL;
4566 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004568 if (size == 0) {
4569 if (consumed)
4570 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004571 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004572 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004575 _PyUnicodeWriter_Init(&writer);
4576 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004577
4578 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 e = s + size;
4580
4581 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004582 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004584 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 if (inShift) { /* in a base-64 section */
4587 if (IS_BASE64(ch)) { /* consume a base-64 character */
4588 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4589 base64bits += 6;
4590 s++;
4591 if (base64bits >= 16) {
4592 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004593 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 base64bits -= 16;
4595 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004596 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 if (surrogate) {
4598 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004599 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4600 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004601 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004602 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004604 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 }
4606 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004607 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004608 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 }
4611 }
Victor Stinner551ac952011-11-29 22:58:13 +01004612 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 /* first surrogate */
4614 surrogate = outCh;
4615 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004617 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004618 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 }
4620 }
4621 }
4622 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 if (base64bits > 0) { /* left-over bits */
4625 if (base64bits >= 6) {
4626 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004627 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 errmsg = "partial character in shift sequence";
4629 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 else {
4632 /* Some bits remain; they should be zero */
4633 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004634 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 errmsg = "non-zero padding bits in shift sequence";
4636 goto utf7Error;
4637 }
4638 }
4639 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004640 if (surrogate && DECODE_DIRECT(ch)) {
4641 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4642 goto onError;
4643 }
4644 surrogate = 0;
4645 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004646 /* '-' is absorbed; other terminating
4647 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004648 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 }
4651 }
4652 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 s++; /* consume '+' */
4655 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004657 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004658 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004659 }
4660 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004661 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004662 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004665 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004666 }
4667 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004670 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004672 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 else {
4674 startinpos = s-starts;
4675 s++;
4676 errmsg = "unexpected special character";
4677 goto utf7Error;
4678 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004679 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004682 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 errors, &errorHandler,
4684 "utf7", errmsg,
4685 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004686 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688 }
4689
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 /* end of string */
4691
4692 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4693 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004694 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004695 if (surrogate ||
4696 (base64bits >= 6) ||
4697 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004699 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 errors, &errorHandler,
4701 "utf7", "unterminated shift sequence",
4702 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004703 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704 goto onError;
4705 if (s < e)
4706 goto restart;
4707 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709
4710 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004711 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004713 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004714 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004715 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004716 writer.kind, writer.data, shiftOutStart);
4717 Py_XDECREF(errorHandler);
4718 Py_XDECREF(exc);
4719 _PyUnicodeWriter_Dealloc(&writer);
4720 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004721 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004722 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004723 }
4724 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004725 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004727 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004728
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004729 Py_XDECREF(errorHandler);
4730 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004731 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004732
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 Py_XDECREF(errorHandler);
4735 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004736 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004737 return NULL;
4738}
4739
4740
Alexander Belopolsky40018472011-02-26 01:02:56 +00004741PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004742_PyUnicode_EncodeUTF7(PyObject *str,
4743 int base64SetO,
4744 int base64WhiteSpace,
4745 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004747 int kind;
4748 void *data;
4749 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004750 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004752 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004753 unsigned int base64bits = 0;
4754 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755 char * out;
4756 char * start;
4757
Benjamin Petersonbac79492012-01-14 13:34:47 -05004758 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004759 return NULL;
4760 kind = PyUnicode_KIND(str);
4761 data = PyUnicode_DATA(str);
4762 len = PyUnicode_GET_LENGTH(str);
4763
4764 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004767 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004768 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004769 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004770 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771 if (v == NULL)
4772 return NULL;
4773
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004774 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004775 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004776 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004777
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 if (inShift) {
4779 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4780 /* shifting out */
4781 if (base64bits) { /* output remaining bits */
4782 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4783 base64buffer = 0;
4784 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 }
4786 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 /* Characters not in the BASE64 set implicitly unshift the sequence
4788 so no '-' is required, except if the character is itself a '-' */
4789 if (IS_BASE64(ch) || ch == '-') {
4790 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004791 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004792 *out++ = (char) ch;
4793 }
4794 else {
4795 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004796 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004797 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 else { /* not in a shift sequence */
4799 if (ch == '+') {
4800 *out++ = '+';
4801 *out++ = '-';
4802 }
4803 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4804 *out++ = (char) ch;
4805 }
4806 else {
4807 *out++ = '+';
4808 inShift = 1;
4809 goto encode_char;
4810 }
4811 }
4812 continue;
4813encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004815 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004816
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 /* code first surrogate */
4818 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004819 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004820 while (base64bits >= 6) {
4821 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4822 base64bits -= 6;
4823 }
4824 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004825 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004826 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 base64bits += 16;
4828 base64buffer = (base64buffer << 16) | ch;
4829 while (base64bits >= 6) {
4830 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4831 base64bits -= 6;
4832 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004833 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834 if (base64bits)
4835 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4836 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004837 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004838 if (_PyBytes_Resize(&v, out - start) < 0)
4839 return NULL;
4840 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004841}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004842PyObject *
4843PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4844 Py_ssize_t size,
4845 int base64SetO,
4846 int base64WhiteSpace,
4847 const char *errors)
4848{
4849 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004850 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004851 if (tmp == NULL)
4852 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004853 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004854 base64WhiteSpace, errors);
4855 Py_DECREF(tmp);
4856 return result;
4857}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004858
/* The UTF-7 helper macros are private to the UTF-7 codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004864
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865/* --- UTF-8 Codec -------------------------------------------------------- */
4866
Alexander Belopolsky40018472011-02-26 01:02:56 +00004867PyObject *
4868PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004869 Py_ssize_t size,
4870 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871{
Walter Dörwald69652032004-09-07 20:24:22 +00004872 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4873}
4874
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004875#include "stringlib/asciilib.h"
4876#include "stringlib/codecs.h"
4877#include "stringlib/undef.h"
4878
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004879#include "stringlib/ucs1lib.h"
4880#include "stringlib/codecs.h"
4881#include "stringlib/undef.h"
4882
4883#include "stringlib/ucs2lib.h"
4884#include "stringlib/codecs.h"
4885#include "stringlib/undef.h"
4886
4887#include "stringlib/ucs4lib.h"
4888#include "stringlib/codecs.h"
4889#include "stringlib/undef.h"
4890
Antoine Pitrouab868312009-01-10 15:40:25 +00004891/* Mask to quickly check whether a C 'long' contains a
4892 non-ASCII, UTF8-encoded char. */
4893#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004894# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004895#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004896# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004897#else
4898# error C 'long' size should be either 4 or 8!
4899#endif
4900
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901static Py_ssize_t
4902ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004903{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004905 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004906
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004907 /*
4908 * Issue #17237: m68k is a bit different from most architectures in
4909 * that objects do not use "natural alignment" - for example, int and
4910 * long are only aligned at 2-byte boundaries. Therefore the assert()
4911 * won't work; also, tests have shown that skipping the "optimised
4912 * version" will even speed up m68k.
4913 */
4914#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004916 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4917 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 /* Fast path, see in STRINGLIB(utf8_decode) for
4919 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004920 /* Help allocation */
4921 const char *_p = p;
4922 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 while (_p < aligned_end) {
4924 unsigned long value = *(const unsigned long *) _p;
4925 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 *((unsigned long *)q) = value;
4928 _p += SIZEOF_LONG;
4929 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004930 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931 p = _p;
4932 while (p < end) {
4933 if ((unsigned char)*p & 0x80)
4934 break;
4935 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004940#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 while (p < end) {
4942 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4943 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004944 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004945 /* Help allocation */
4946 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 while (_p < aligned_end) {
4948 unsigned long value = *(unsigned long *) _p;
4949 if (value & ASCII_CHAR_MASK)
4950 break;
4951 _p += SIZEOF_LONG;
4952 }
4953 p = _p;
4954 if (_p == end)
4955 break;
4956 }
4957 if ((unsigned char)*p & 0x80)
4958 break;
4959 ++p;
4960 }
4961 memcpy(dest, start, p - start);
4962 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963}
Antoine Pitrouab868312009-01-10 15:40:25 +00004964
Victor Stinner785938e2011-12-11 20:09:03 +01004965PyObject *
4966PyUnicode_DecodeUTF8Stateful(const char *s,
4967 Py_ssize_t size,
4968 const char *errors,
4969 Py_ssize_t *consumed)
4970{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004971 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004972 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974
4975 Py_ssize_t startinpos;
4976 Py_ssize_t endinpos;
4977 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004978 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004980 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004981
4982 if (size == 0) {
4983 if (consumed)
4984 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004985 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004986 }
4987
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4989 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004990 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 *consumed = 1;
4992 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004993 }
4994
Victor Stinner8f674cc2013-04-17 23:02:17 +02004995 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004996 writer.min_length = size;
4997 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004998 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004999
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 writer.pos = ascii_decode(s, end, writer.data);
5001 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002 while (s < end) {
5003 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005004 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005005
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005007 if (PyUnicode_IS_ASCII(writer.buffer))
5008 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005010 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 } else {
5014 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 }
5017
5018 switch (ch) {
5019 case 0:
5020 if (s == end || consumed)
5021 goto End;
5022 errmsg = "unexpected end of data";
5023 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005024 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 break;
5026 case 1:
5027 errmsg = "invalid start byte";
5028 startinpos = s - starts;
5029 endinpos = startinpos + 1;
5030 break;
5031 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005032 case 3:
5033 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005034 errmsg = "invalid continuation byte";
5035 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005036 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005037 break;
5038 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005039 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005040 goto onError;
5041 continue;
5042 }
5043
Victor Stinner1d65d912015-10-05 13:43:50 +02005044 if (error_handler == _Py_ERROR_UNKNOWN)
5045 error_handler = get_error_handler(errors);
5046
5047 switch (error_handler) {
5048 case _Py_ERROR_IGNORE:
5049 s += (endinpos - startinpos);
5050 break;
5051
5052 case _Py_ERROR_REPLACE:
5053 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5054 goto onError;
5055 s += (endinpos - startinpos);
5056 break;
5057
5058 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 {
5060 Py_ssize_t i;
5061
Victor Stinner1d65d912015-10-05 13:43:50 +02005062 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5063 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005064 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005065 ch = (Py_UCS4)(unsigned char)(starts[i]);
5066 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5067 ch + 0xdc00);
5068 writer.pos++;
5069 }
5070 s += (endinpos - startinpos);
5071 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005072 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005073
5074 default:
5075 if (unicode_decode_call_errorhandler_writer(
5076 errors, &error_handler_obj,
5077 "utf-8", errmsg,
5078 &starts, &end, &startinpos, &endinpos, &exc, &s,
5079 &writer))
5080 goto onError;
5081 }
Victor Stinner785938e2011-12-11 20:09:03 +01005082 }
5083
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 if (consumed)
5086 *consumed = s - starts;
5087
Victor Stinner1d65d912015-10-05 13:43:50 +02005088 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005090 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091
5092onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005093 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005095 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005096 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005097}
5098
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using the surrogateescape error handler, used
   to decode the command line arguments on Mac OS X and Android.

   Returns a newly allocated, NUL-terminated wide character string that
   must be released with PyMem_RawFree().  Returns NULL if the required
   buffer size does not fit in Py_ssize_t or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *buf;
    Py_ssize_t n;

    /* Note: `size` is always at least as large as the resulting number of
       characters, so size + 1 wchar_t slots are enough. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
        return NULL;
    buf = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!buf)
        return NULL;

    /* Unpack the UTF-8 encoded data */
    stop = s + size;
    n = 0;
    while (s < stop) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, stop, (Py_UCS4 *)buf, &n);
#else
        ch = ucs2lib_utf8_decode(&s, stop, (Py_UCS2 *)buf, &n);
#endif
        if (ch <= 0xFF) {
            /* Zero with the input exhausted means that we are done. */
            if (!ch && s == stop)
                break;
            /* surrogateescape: map the current byte b to 0xDC00 + b */
            buf[n++] = 0xDC00 + (unsigned char)*s++;
            continue;
        }
#if SIZEOF_WCHAR_T == 4
        assert(0);
#else
        assert(ch > 0xFFFF && ch <= MAX_UNICODE);
        /* The character does not fit into one 16-bit unit: compute and
           append the two surrogates. */
        buf[n++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
        buf[n++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
    }
    buf[n] = L'\0';
    return buf;
}

#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155/* Primary internal function which creates utf8 encoded bytes objects.
5156
5157 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005158 and allocate exactly as much space needed at the end. Else allocate the
5159 maximum possible needed (4 result bytes per Unicode character), and return
5160 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005161*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005162PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005163_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164{
Victor Stinner6099a032011-12-18 14:22:26 +01005165 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 void *data;
5167 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005169 if (!PyUnicode_Check(unicode)) {
5170 PyErr_BadArgument();
5171 return NULL;
5172 }
5173
5174 if (PyUnicode_READY(unicode) == -1)
5175 return NULL;
5176
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005177 if (PyUnicode_UTF8(unicode))
5178 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5179 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005180
5181 kind = PyUnicode_KIND(unicode);
5182 data = PyUnicode_DATA(unicode);
5183 size = PyUnicode_GET_LENGTH(unicode);
5184
Benjamin Petersonead6b532011-12-20 17:23:42 -06005185 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005186 default:
5187 assert(0);
5188 case PyUnicode_1BYTE_KIND:
5189 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5190 assert(!PyUnicode_IS_ASCII(unicode));
5191 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5192 case PyUnicode_2BYTE_KIND:
5193 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5194 case PyUnicode_4BYTE_KIND:
5195 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197}
5198
Alexander Belopolsky40018472011-02-26 01:02:56 +00005199PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005200PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5201 Py_ssize_t size,
5202 const char *errors)
5203{
5204 PyObject *v, *unicode;
5205
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005206 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005207 if (unicode == NULL)
5208 return NULL;
5209 v = _PyUnicode_AsUTF8String(unicode, errors);
5210 Py_DECREF(unicode);
5211 return v;
5212}
5213
5214PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005215PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005217 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218}
5219
/* --- UTF-32 Codec ------------------------------------------------------- */
5221
5222PyObject *
5223PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 Py_ssize_t size,
5225 const char *errors,
5226 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005227{
5228 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5229}
5230
5231PyObject *
5232PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 Py_ssize_t size,
5234 const char *errors,
5235 int *byteorder,
5236 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005237{
5238 const char *starts = s;
5239 Py_ssize_t startinpos;
5240 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005242 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005243 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005244 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 PyObject *errorHandler = NULL;
5247 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005248
Walter Dörwald41980ca2007-08-16 21:55:45 +00005249 q = (unsigned char *)s;
5250 e = q + size;
5251
5252 if (byteorder)
5253 bo = *byteorder;
5254
5255 /* Check for BOM marks (U+FEFF) in the input and adjust current
5256 byte order setting accordingly. In native mode, the leading BOM
5257 mark is skipped, in all other modes, it is copied to the output
5258 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005259 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005260 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005261 if (bom == 0x0000FEFF) {
5262 bo = -1;
5263 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005265 else if (bom == 0xFFFE0000) {
5266 bo = 1;
5267 q += 4;
5268 }
5269 if (byteorder)
5270 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271 }
5272
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 if (q == e) {
5274 if (consumed)
5275 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005276 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005277 }
5278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279#ifdef WORDS_BIGENDIAN
5280 le = bo < 0;
5281#else
5282 le = bo <= 0;
5283#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005284 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005285
Victor Stinner8f674cc2013-04-17 23:02:17 +02005286 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005287 writer.min_length = (e - q + 3) / 4;
5288 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005289 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005290
Victor Stinnere64322e2012-10-30 23:12:47 +01005291 while (1) {
5292 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005293 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005294
Victor Stinnere64322e2012-10-30 23:12:47 +01005295 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005296 enum PyUnicode_Kind kind = writer.kind;
5297 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005298 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005300 if (le) {
5301 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005302 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 if (ch > maxch)
5304 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005305 if (kind != PyUnicode_1BYTE_KIND &&
5306 Py_UNICODE_IS_SURROGATE(ch))
5307 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 q += 4;
5310 } while (q <= last);
5311 }
5312 else {
5313 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005314 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005315 if (ch > maxch)
5316 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005317 if (kind != PyUnicode_1BYTE_KIND &&
5318 Py_UNICODE_IS_SURROGATE(ch))
5319 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005320 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 q += 4;
5322 } while (q <= last);
5323 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005324 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005325 }
5326
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005327 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005328 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005329 startinpos = ((const char *)q) - starts;
5330 endinpos = startinpos + 4;
5331 }
5332 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005335 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005337 startinpos = ((const char *)q) - starts;
5338 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005340 else {
5341 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005342 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 goto onError;
5344 q += 4;
5345 continue;
5346 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005347 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005348 startinpos = ((const char *)q) - starts;
5349 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005351
5352 /* The remaining input chars are ignored if the callback
5353 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005356 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 }
5361
Walter Dörwald41980ca2007-08-16 21:55:45 +00005362 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364
Walter Dörwald41980ca2007-08-16 21:55:45 +00005365 Py_XDECREF(errorHandler);
5366 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 Py_XDECREF(errorHandler);
5372 Py_XDECREF(exc);
5373 return NULL;
5374}
5375
5376PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005377_PyUnicode_EncodeUTF32(PyObject *str,
5378 const char *errors,
5379 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005381 enum PyUnicode_Kind kind;
5382 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005384 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005385 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005386#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005388#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005389 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005390#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005391 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005392 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005393 PyObject *errorHandler = NULL;
5394 PyObject *exc = NULL;
5395 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005397 if (!PyUnicode_Check(str)) {
5398 PyErr_BadArgument();
5399 return NULL;
5400 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005401 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402 return NULL;
5403 kind = PyUnicode_KIND(str);
5404 data = PyUnicode_DATA(str);
5405 len = PyUnicode_GET_LENGTH(str);
5406
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005407 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005408 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005410 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005411 if (v == NULL)
5412 return NULL;
5413
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 /* output buffer is 4-bytes aligned */
5415 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005416 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005418 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005423 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005426 else
5427 encoding = "utf-32";
5428
5429 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5431 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005432 }
5433
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005434 pos = 0;
5435 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005436 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005437
5438 if (kind == PyUnicode_2BYTE_KIND) {
5439 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5440 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005441 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005442 else {
5443 assert(kind == PyUnicode_4BYTE_KIND);
5444 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5445 &out, native_ordering);
5446 }
5447 if (pos == len)
5448 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005449
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005450 rep = unicode_encode_call_errorhandler(
5451 errors, &errorHandler,
5452 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005453 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005454 if (!rep)
5455 goto error;
5456
5457 if (PyBytes_Check(rep)) {
5458 repsize = PyBytes_GET_SIZE(rep);
5459 if (repsize & 3) {
5460 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 "surrogates not allowed");
5463 goto error;
5464 }
5465 moreunits = repsize / 4;
5466 }
5467 else {
5468 assert(PyUnicode_Check(rep));
5469 if (PyUnicode_READY(rep) < 0)
5470 goto error;
5471 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5472 if (!PyUnicode_IS_ASCII(rep)) {
5473 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005474 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005475 "surrogates not allowed");
5476 goto error;
5477 }
5478 }
5479
5480 /* four bytes are reserved for each surrogate */
5481 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005482 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005483 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005484 /* integer overflow */
5485 PyErr_NoMemory();
5486 goto error;
5487 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005488 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005490 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 }
5492
5493 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005494 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005495 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005498 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5499 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005500 }
5501
5502 Py_CLEAR(rep);
5503 }
5504
5505 /* Cut back to size actually needed. This is necessary for, for example,
5506 encoding of a string containing isolated surrogates and the 'ignore'
5507 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005508 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005509 if (nsize != PyBytes_GET_SIZE(v))
5510 _PyBytes_Resize(&v, nsize);
5511 Py_XDECREF(errorHandler);
5512 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005513 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005514 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 error:
5516 Py_XDECREF(rep);
5517 Py_XDECREF(errorHandler);
5518 Py_XDECREF(exc);
5519 Py_XDECREF(v);
5520 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005521}
5522
Alexander Belopolsky40018472011-02-26 01:02:56 +00005523PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005524PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5525 Py_ssize_t size,
5526 const char *errors,
5527 int byteorder)
5528{
5529 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005530 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005531 if (tmp == NULL)
5532 return NULL;
5533 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5534 Py_DECREF(tmp);
5535 return result;
5536}
5537
5538PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005539PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005540{
Victor Stinnerb960b342011-11-20 19:12:52 +01005541 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005542}
5543
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544/* --- UTF-16 Codec ------------------------------------------------------- */
5545
Tim Peters772747b2001-08-09 22:21:55 +00005546PyObject *
5547PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 Py_ssize_t size,
5549 const char *errors,
5550 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551{
Walter Dörwald69652032004-09-07 20:24:22 +00005552 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5553}
5554
5555PyObject *
5556PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 Py_ssize_t size,
5558 const char *errors,
5559 int *byteorder,
5560 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 Py_ssize_t startinpos;
5564 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005565 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005567 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005568 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005569 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 PyObject *errorHandler = NULL;
5571 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005572 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Tim Peters772747b2001-08-09 22:21:55 +00005574 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005575 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
5577 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005578 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005580 /* Check for BOM marks (U+FEFF) in the input and adjust current
5581 byte order setting accordingly. In native mode, the leading BOM
5582 mark is skipped, in all other modes, it is copied to the output
5583 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005584 if (bo == 0 && size >= 2) {
5585 const Py_UCS4 bom = (q[1] << 8) | q[0];
5586 if (bom == 0xFEFF) {
5587 q += 2;
5588 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 else if (bom == 0xFFFE) {
5591 q += 2;
5592 bo = 1;
5593 }
5594 if (byteorder)
5595 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597
Antoine Pitrou63065d72012-05-15 23:48:04 +02005598 if (q == e) {
5599 if (consumed)
5600 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005601 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005602 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005603
Christian Heimes743e0cd2012-10-17 23:52:17 +02005604#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005605 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005606 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005607#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005608 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005609 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005610#endif
Tim Peters772747b2001-08-09 22:21:55 +00005611
Antoine Pitrou63065d72012-05-15 23:48:04 +02005612 /* Note: size will always be longer than the resulting Unicode
5613 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005614 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005615 writer.min_length = (e - q + 1) / 2;
5616 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005617 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618
Antoine Pitrou63065d72012-05-15 23:48:04 +02005619 while (1) {
5620 Py_UCS4 ch = 0;
5621 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005622 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005623 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005626 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627 native_ordering);
5628 else
5629 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005630 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005631 native_ordering);
5632 } else if (kind == PyUnicode_2BYTE_KIND) {
5633 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005634 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005635 native_ordering);
5636 } else {
5637 assert(kind == PyUnicode_4BYTE_KIND);
5638 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005640 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005641 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005642 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643
Antoine Pitrou63065d72012-05-15 23:48:04 +02005644 switch (ch)
5645 {
5646 case 0:
5647 /* remaining byte at the end? (size should be even) */
5648 if (q == e || consumed)
5649 goto End;
5650 errmsg = "truncated data";
5651 startinpos = ((const char *)q) - starts;
5652 endinpos = ((const char *)e) - starts;
5653 break;
5654 /* The remaining input chars are ignored if the callback
5655 chooses to skip the input */
5656 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005657 q -= 2;
5658 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005659 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005660 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005661 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005662 endinpos = ((const char *)e) - starts;
5663 break;
5664 case 2:
5665 errmsg = "illegal encoding";
5666 startinpos = ((const char *)q) - 2 - starts;
5667 endinpos = startinpos + 2;
5668 break;
5669 case 3:
5670 errmsg = "illegal UTF-16 surrogate";
5671 startinpos = ((const char *)q) - 4 - starts;
5672 endinpos = startinpos + 2;
5673 break;
5674 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005675 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005676 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 continue;
5678 }
5679
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005680 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005681 errors,
5682 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005683 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005684 &starts,
5685 (const char **)&e,
5686 &startinpos,
5687 &endinpos,
5688 &exc,
5689 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005690 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 }
5693
Antoine Pitrou63065d72012-05-15 23:48:04 +02005694End:
Walter Dörwald69652032004-09-07 20:24:22 +00005695 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 Py_XDECREF(errorHandler);
5699 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005703 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 Py_XDECREF(errorHandler);
5705 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 return NULL;
5707}
5708
Tim Peters772747b2001-08-09 22:21:55 +00005709PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005710_PyUnicode_EncodeUTF16(PyObject *str,
5711 const char *errors,
5712 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005714 enum PyUnicode_Kind kind;
5715 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005717 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005719 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005720#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005721 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005722#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005723 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005724#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 const char *encoding;
5726 Py_ssize_t nsize, pos;
5727 PyObject *errorHandler = NULL;
5728 PyObject *exc = NULL;
5729 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 if (!PyUnicode_Check(str)) {
5732 PyErr_BadArgument();
5733 return NULL;
5734 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005735 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 return NULL;
5737 kind = PyUnicode_KIND(str);
5738 data = PyUnicode_DATA(str);
5739 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005740
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005741 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005742 if (kind == PyUnicode_4BYTE_KIND) {
5743 const Py_UCS4 *in = (const Py_UCS4 *)data;
5744 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005745 while (in < end) {
5746 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005747 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005748 }
5749 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005750 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005751 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005753 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005754 nsize = len + pairs + (byteorder == 0);
5755 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005756 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005760 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005761 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005762 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005764 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
5766 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005767 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005768 }
Tim Peters772747b2001-08-09 22:21:55 +00005769
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 if (kind == PyUnicode_1BYTE_KIND) {
5771 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5772 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005773 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005774
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005775 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005776 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005777 }
5778 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005780 }
5781 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005782 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005783 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005784
5785 pos = 0;
5786 while (pos < len) {
5787 Py_ssize_t repsize, moreunits;
5788
5789 if (kind == PyUnicode_2BYTE_KIND) {
5790 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5791 &out, native_ordering);
5792 }
5793 else {
5794 assert(kind == PyUnicode_4BYTE_KIND);
5795 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5796 &out, native_ordering);
5797 }
5798 if (pos == len)
5799 break;
5800
5801 rep = unicode_encode_call_errorhandler(
5802 errors, &errorHandler,
5803 encoding, "surrogates not allowed",
5804 str, &exc, pos, pos + 1, &pos);
5805 if (!rep)
5806 goto error;
5807
5808 if (PyBytes_Check(rep)) {
5809 repsize = PyBytes_GET_SIZE(rep);
5810 if (repsize & 1) {
5811 raise_encode_exception(&exc, encoding,
5812 str, pos - 1, pos,
5813 "surrogates not allowed");
5814 goto error;
5815 }
5816 moreunits = repsize / 2;
5817 }
5818 else {
5819 assert(PyUnicode_Check(rep));
5820 if (PyUnicode_READY(rep) < 0)
5821 goto error;
5822 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5823 if (!PyUnicode_IS_ASCII(rep)) {
5824 raise_encode_exception(&exc, encoding,
5825 str, pos - 1, pos,
5826 "surrogates not allowed");
5827 goto error;
5828 }
5829 }
5830
5831 /* two bytes are reserved for each surrogate */
5832 if (moreunits > 1) {
5833 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005834 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005835 /* integer overflow */
5836 PyErr_NoMemory();
5837 goto error;
5838 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005839 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005840 goto error;
5841 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5842 }
5843
5844 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005845 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 out += moreunits;
5847 } else /* rep is unicode */ {
5848 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5849 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5850 &out, native_ordering);
5851 }
5852
5853 Py_CLEAR(rep);
5854 }
5855
5856 /* Cut back to size actually needed. This is necessary for, for example,
5857 encoding of a string containing isolated surrogates and the 'ignore' handler
5858 is used. */
5859 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5860 if (nsize != PyBytes_GET_SIZE(v))
5861 _PyBytes_Resize(&v, nsize);
5862 Py_XDECREF(errorHandler);
5863 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005864 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005865 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005866 error:
5867 Py_XDECREF(rep);
5868 Py_XDECREF(errorHandler);
5869 Py_XDECREF(exc);
5870 Py_XDECREF(v);
5871 return NULL;
5872#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873}
5874
Alexander Belopolsky40018472011-02-26 01:02:56 +00005875PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005876PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5877 Py_ssize_t size,
5878 const char *errors,
5879 int byteorder)
5880{
5881 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005882 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 if (tmp == NULL)
5884 return NULL;
5885 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5886 Py_DECREF(tmp);
5887 return result;
5888}
5889
5890PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005891PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894}
5895
5896/* --- Unicode Escape Codec ----------------------------------------------- */
5897
/* Name-lookup C API imported from the capsule PyUnicodeData_CAPSULE_NAME.
   Starts out NULL and is filled in the first time a \N{name} escape is
   decoded by _PyUnicode_DecodeUnicodeEscape(). */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005899
Alexander Belopolsky40018472011-02-26 01:02:56 +00005900PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005901_PyUnicode_DecodeUnicodeEscape(const char *s,
5902 Py_ssize_t size,
5903 const char *errors,
5904 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005907 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 PyObject *errorHandler = NULL;
5910 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005911
Eric V. Smith42454af2016-10-31 09:22:08 -04005912 // so we can remember if we've seen an invalid escape char or not
5913 *first_invalid_escape = NULL;
5914
Victor Stinner62ec3312016-09-06 17:04:34 -07005915 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005916 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005917 }
5918 /* Escaped strings will always be longer than the resulting
5919 Unicode string, so we start with size here and then reduce the
5920 length after conversion to the true value.
5921 (but if the error callback returns a long replacement string
5922 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005923 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005924 writer.min_length = size;
5925 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5926 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005927 }
5928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 end = s + size;
5930 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005931 unsigned char c = (unsigned char) *s++;
5932 Py_UCS4 ch;
5933 int count;
5934 Py_ssize_t startinpos;
5935 Py_ssize_t endinpos;
5936 const char *message;
5937
5938#define WRITE_ASCII_CHAR(ch) \
5939 do { \
5940 assert(ch <= 127); \
5941 assert(writer.pos < writer.size); \
5942 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5943 } while(0)
5944
5945#define WRITE_CHAR(ch) \
5946 do { \
5947 if (ch <= writer.maxchar) { \
5948 assert(writer.pos < writer.size); \
5949 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5950 } \
5951 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5952 goto onError; \
5953 } \
5954 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
5956 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005957 if (c != '\\') {
5958 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 continue;
5960 }
5961
Victor Stinner62ec3312016-09-06 17:04:34 -07005962 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 if (s >= end) {
5965 message = "\\ at end of string";
5966 goto error;
5967 }
5968 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969
Victor Stinner62ec3312016-09-06 17:04:34 -07005970 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005971 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005974 case '\n': continue;
5975 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5976 case '\'': WRITE_ASCII_CHAR('\''); continue;
5977 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5978 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005979 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005980 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5981 case 't': WRITE_ASCII_CHAR('\t'); continue;
5982 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5983 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005984 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005985 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005987 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 case '0': case '1': case '2': case '3':
5991 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005993 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 ch = (ch<<3) + *s++ - '0';
5995 if (s < end && '0' <= *s && *s <= '7') {
5996 ch = (ch<<3) + *s++ - '0';
5997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 WRITE_CHAR(ch);
6000 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 /* hex escapes */
6003 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006005 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006006 message = "truncated \\xXX escape";
6007 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006011 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006012 message = "truncated \\uXXXX escape";
6013 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006016 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006017 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006018 message = "truncated \\UXXXXXXXX escape";
6019 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006020 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006021 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006022 ch <<= 4;
6023 if (c >= '0' && c <= '9') {
6024 ch += c - '0';
6025 }
6026 else if (c >= 'a' && c <= 'f') {
6027 ch += c - ('a' - 10);
6028 }
6029 else if (c >= 'A' && c <= 'F') {
6030 ch += c - ('A' - 10);
6031 }
6032 else {
6033 break;
6034 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006035 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006036 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006037 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006038 }
6039
6040 /* when we get here, ch is a 32-bit unicode character */
6041 if (ch > MAX_UNICODE) {
6042 message = "illegal Unicode character";
6043 goto error;
6044 }
6045
6046 WRITE_CHAR(ch);
6047 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006050 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006051 if (ucnhash_CAPI == NULL) {
6052 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6054 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006055 if (ucnhash_CAPI == NULL) {
6056 PyErr_SetString(
6057 PyExc_UnicodeError,
6058 "\\N escapes not supported (can't load unicodedata module)"
6059 );
6060 goto onError;
6061 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006063
6064 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006065 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 const char *start = ++s;
6067 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006068 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006069 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006070 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006071 namelen = s - start;
6072 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 ch = 0xffffffff; /* in case 'getcode' messes up */
6076 if (namelen <= INT_MAX &&
6077 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6078 &ch, 0)) {
6079 assert(ch <= MAX_UNICODE);
6080 WRITE_CHAR(ch);
6081 continue;
6082 }
6083 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006084 }
6085 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006086 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006087
6088 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006089 if (*first_invalid_escape == NULL) {
6090 *first_invalid_escape = s-1; /* Back up one char, since we've
6091 already incremented s. */
6092 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006093 WRITE_ASCII_CHAR('\\');
6094 WRITE_CHAR(c);
6095 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006097
6098 error:
6099 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006100 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006101 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006102 errors, &errorHandler,
6103 "unicodeescape", message,
6104 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006105 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006106 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006107 }
6108 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6109 goto onError;
6110 }
6111
6112#undef WRITE_ASCII_CHAR
6113#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006115
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006116 Py_XDECREF(errorHandler);
6117 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006119
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006121 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 Py_XDECREF(errorHandler);
6123 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 return NULL;
6125}
6126
Eric V. Smith42454af2016-10-31 09:22:08 -04006127PyObject *
6128PyUnicode_DecodeUnicodeEscape(const char *s,
6129 Py_ssize_t size,
6130 const char *errors)
6131{
6132 const char *first_invalid_escape;
6133 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6134 &first_invalid_escape);
6135 if (result == NULL)
6136 return NULL;
6137 if (first_invalid_escape != NULL) {
6138 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6139 "invalid escape sequence '\\%c'",
6140 *first_invalid_escape) < 0) {
6141 Py_DECREF(result);
6142 return NULL;
6143 }
6144 }
6145 return result;
6146}
6147
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006148/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Alexander Belopolsky40018472011-02-26 01:02:56 +00006150PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006157 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Ezio Melottie7f90372012-10-05 03:33:31 +03006160 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006161 escape.
6162
Ezio Melottie7f90372012-10-05 03:33:31 +03006163 For UCS1 strings it's '\xxx', 4 bytes per source character.
6164 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6165 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006166 */
6167
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 if (!PyUnicode_Check(unicode)) {
6169 PyErr_BadArgument();
6170 return NULL;
6171 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006174 }
Victor Stinner358af132015-10-12 22:36:57 +02006175
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 if (len == 0) {
6178 return PyBytes_FromStringAndSize(NULL, 0);
6179 }
6180
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 kind = PyUnicode_KIND(unicode);
6182 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6184 bytes, and 1 byte characters 4. */
6185 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006186 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 return PyErr_NoMemory();
6188 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006189 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 if (repr == NULL) {
6191 return NULL;
6192 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193
Victor Stinner62ec3312016-09-06 17:04:34 -07006194 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006196 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006197
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 /* U+0000-U+00ff range */
6199 if (ch < 0x100) {
6200 if (ch >= ' ' && ch < 127) {
6201 if (ch != '\\') {
6202 /* Copy printable US ASCII as-is */
6203 *p++ = (char) ch;
6204 }
6205 /* Escape backslashes */
6206 else {
6207 *p++ = '\\';
6208 *p++ = '\\';
6209 }
6210 }
Victor Stinner358af132015-10-12 22:36:57 +02006211
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 /* Map special whitespace to '\t', \n', '\r' */
6213 else if (ch == '\t') {
6214 *p++ = '\\';
6215 *p++ = 't';
6216 }
6217 else if (ch == '\n') {
6218 *p++ = '\\';
6219 *p++ = 'n';
6220 }
6221 else if (ch == '\r') {
6222 *p++ = '\\';
6223 *p++ = 'r';
6224 }
6225
6226 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6227 else {
6228 *p++ = '\\';
6229 *p++ = 'x';
6230 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6231 *p++ = Py_hexdigits[ch & 0x000F];
6232 }
Tim Petersced69f82003-09-16 20:30:58 +00006233 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006234 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 *p++ = '\\';
6237 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006238 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6239 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6240 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6241 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6244 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006245
Victor Stinner62ec3312016-09-06 17:04:34 -07006246 /* Make sure that the first two digits are zero */
6247 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006248 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 *p++ = 'U';
6250 *p++ = '0';
6251 *p++ = '0';
6252 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6257 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 assert(p - PyBytes_AS_STRING(repr) > 0);
6262 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6263 return NULL;
6264 }
6265 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266}
6267
Alexander Belopolsky40018472011-02-26 01:02:56 +00006268PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006269PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6270 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006272 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006273 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 }
6277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 result = PyUnicode_AsUnicodeEscapeString(tmp);
6279 Py_DECREF(tmp);
6280 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281}
6282
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6284
Alexander Belopolsky40018472011-02-26 01:02:56 +00006285PyObject *
6286PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006287 Py_ssize_t size,
6288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006290 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006291 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 PyObject *errorHandler = NULL;
6294 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006295
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006297 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 /* Escaped strings will always be longer than the resulting
6301 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006302 length after conversion to the true value. (But decoding error
6303 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006304 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 writer.min_length = size;
6306 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6307 goto onError;
6308 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006309
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 end = s + size;
6311 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006312 unsigned char c = (unsigned char) *s++;
6313 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006314 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 Py_ssize_t startinpos;
6316 Py_ssize_t endinpos;
6317 const char *message;
6318
6319#define WRITE_CHAR(ch) \
6320 do { \
6321 if (ch <= writer.maxchar) { \
6322 assert(writer.pos < writer.size); \
6323 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6324 } \
6325 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6326 goto onError; \
6327 } \
6328 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 if (c != '\\' || s >= end) {
6332 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006335
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 c = (unsigned char) *s++;
6337 if (c == 'u') {
6338 count = 4;
6339 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 else if (c == 'U') {
6342 count = 8;
6343 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006344 }
6345 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 assert(writer.pos < writer.size);
6347 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6348 WRITE_CHAR(c);
6349 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006350 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 startinpos = s - starts - 2;
6352
6353 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6354 for (ch = 0; count && s < end; ++s, --count) {
6355 c = (unsigned char)*s;
6356 ch <<= 4;
6357 if (c >= '0' && c <= '9') {
6358 ch += c - '0';
6359 }
6360 else if (c >= 'a' && c <= 'f') {
6361 ch += c - ('a' - 10);
6362 }
6363 else if (c >= 'A' && c <= 'F') {
6364 ch += c - ('A' - 10);
6365 }
6366 else {
6367 break;
6368 }
6369 }
6370 if (!count) {
6371 if (ch <= MAX_UNICODE) {
6372 WRITE_CHAR(ch);
6373 continue;
6374 }
6375 message = "\\Uxxxxxxxx out of range";
6376 }
6377
6378 endinpos = s-starts;
6379 writer.min_length = end - s + writer.pos;
6380 if (unicode_decode_call_errorhandler_writer(
6381 errors, &errorHandler,
6382 "rawunicodeescape", message,
6383 &starts, &end, &startinpos, &endinpos, &exc, &s,
6384 &writer)) {
6385 goto onError;
6386 }
6387 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6388 goto onError;
6389 }
6390
6391#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 Py_XDECREF(errorHandler);
6394 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006395 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403}
6404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408{
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412 int kind;
6413 void *data;
6414 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416 if (!PyUnicode_Check(unicode)) {
6417 PyErr_BadArgument();
6418 return NULL;
6419 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 kind = PyUnicode_KIND(unicode);
6424 data = PyUnicode_DATA(unicode);
6425 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 if (kind == PyUnicode_1BYTE_KIND) {
6427 return PyBytes_FromStringAndSize(data, len);
6428 }
Victor Stinner0e368262011-11-10 20:12:49 +01006429
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6431 bytes, and 1 byte characters 4. */
6432 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006433
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 if (len > PY_SSIZE_T_MAX / expandsize) {
6435 return PyErr_NoMemory();
6436 }
6437 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6438 if (repr == NULL) {
6439 return NULL;
6440 }
6441 if (len == 0) {
6442 return repr;
6443 }
6444
6445 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446 for (pos = 0; pos < len; pos++) {
6447 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006448
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6450 if (ch < 0x100) {
6451 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006452 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6454 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 *p++ = '\\';
6456 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006457 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6460 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006462 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6463 else {
6464 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6465 *p++ = '\\';
6466 *p++ = 'U';
6467 *p++ = '0';
6468 *p++ = '0';
6469 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6474 *p++ = Py_hexdigits[ch & 15];
6475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006477
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 assert(p > PyBytes_AS_STRING(repr));
6479 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6480 return NULL;
6481 }
6482 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483}
6484
Alexander Belopolsky40018472011-02-26 01:02:56 +00006485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6487 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006490 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006492 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6494 Py_DECREF(tmp);
6495 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
/* --- Unicode Internal Codec ------------------------------------------- */
6499
Alexander Belopolsky40018472011-02-26 01:02:56 +00006500PyObject *
6501_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006502 Py_ssize_t size,
6503 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006504{
6505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006506 Py_ssize_t startinpos;
6507 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006508 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006509 const char *end;
6510 const char *reason;
6511 PyObject *errorHandler = NULL;
6512 PyObject *exc = NULL;
6513
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006514 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006515 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006516 1))
6517 return NULL;
6518
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006519 if (size < 0) {
6520 PyErr_BadInternalCall();
6521 return NULL;
6522 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006523 if (size == 0)
6524 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006525
Victor Stinner8f674cc2013-04-17 23:02:17 +02006526 _PyUnicodeWriter_Init(&writer);
6527 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6528 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006530 }
6531 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006535 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006536 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006537 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006538 endinpos = end-starts;
6539 reason = "truncated input";
6540 goto error;
6541 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006542 /* We copy the raw representation one byte at a time because the
6543 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006544 ((char *) &uch)[0] = s[0];
6545 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006546#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[2] = s[2];
6548 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006551#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 /* We have to sanity check the raw data, otherwise doom looms for
6553 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006554 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006555 endinpos = s - starts + Py_UNICODE_SIZE;
6556 reason = "illegal code point (> 0x10FFFF)";
6557 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006558 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006559#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006560 s += Py_UNICODE_SIZE;
6561#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006562 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006564 Py_UNICODE uch2;
6565 ((char *) &uch2)[0] = s[0];
6566 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006567 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006568 {
Victor Stinner551ac952011-11-29 22:58:13 +01006569 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006570 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006571 }
6572 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573#endif
6574
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006575 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006577 continue;
6578
6579 error:
6580 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006581 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006582 errors, &errorHandler,
6583 "unicode_internal", reason,
6584 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006585 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006586 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006587 }
6588
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006589 Py_XDECREF(errorHandler);
6590 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006591 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006594 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006595 Py_XDECREF(errorHandler);
6596 Py_XDECREF(exc);
6597 return NULL;
6598}
6599
/* --- Latin-1 Codec ------------------------------------------------------ */
6601
Alexander Belopolsky40018472011-02-26 01:02:56 +00006602PyObject *
6603PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006604 Py_ssize_t size,
6605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006608 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609}
6610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006611/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612static void
6613make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006615 PyObject *unicode,
6616 Py_ssize_t startpos, Py_ssize_t endpos,
6617 const char *reason)
6618{
6619 if (*exceptionObject == NULL) {
6620 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006621 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006622 encoding, unicode, startpos, endpos, reason);
6623 }
6624 else {
6625 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6626 goto onError;
6627 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6628 goto onError;
6629 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6630 goto onError;
6631 return;
6632 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006633 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006634 }
6635}
6636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638static void
6639raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006640 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006641 PyObject *unicode,
6642 Py_ssize_t startpos, Py_ssize_t endpos,
6643 const char *reason)
6644{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006645 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006646 encoding, unicode, startpos, endpos, reason);
6647 if (*exceptionObject != NULL)
6648 PyCodec_StrictErrors(*exceptionObject);
6649}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650
6651/* error handling callback helper:
6652 build arguments, call the callback and check the arguments,
6653 put the result into newpos and return the replacement string, which
6654 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655static PyObject *
6656unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006657 PyObject **errorHandler,
6658 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 Py_ssize_t startpos, Py_ssize_t endpos,
6661 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006663 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 PyObject *restuple;
6666 PyObject *resunicode;
6667
6668 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 }
6673
Benjamin Petersonbac79492012-01-14 13:34:47 -05006674 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006675 return NULL;
6676 len = PyUnicode_GET_LENGTH(unicode);
6677
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006678 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006683 restuple = PyObject_CallFunctionObjArgs(
6684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006688 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 Py_DECREF(restuple);
6690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006692 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 &resunicode, newpos)) {
6694 Py_DECREF(restuple);
6695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006697 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6698 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6699 Py_DECREF(restuple);
6700 return NULL;
6701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006703 *newpos = len + *newpos;
6704 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006705 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 Py_DECREF(restuple);
6707 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_INCREF(resunicode);
6710 Py_DECREF(restuple);
6711 return resunicode;
6712}
6713
Alexander Belopolsky40018472011-02-26 01:02:56 +00006714static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006716 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006717 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 /* input state */
6720 Py_ssize_t pos=0, size;
6721 int kind;
6722 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 /* pointer into the output */
6724 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006725 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6726 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006727 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006729 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006730 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006731 /* output object */
6732 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733
Benjamin Petersonbac79492012-01-14 13:34:47 -05006734 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 return NULL;
6736 size = PyUnicode_GET_LENGTH(unicode);
6737 kind = PyUnicode_KIND(unicode);
6738 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 /* allocate enough for a simple encoding without
6740 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006741 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006742 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006743
6744 _PyBytesWriter_Init(&writer);
6745 str = _PyBytesWriter_Alloc(&writer, size);
6746 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006749 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006750 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006755 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006757 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006759 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006762 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006764
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006765 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006767
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006768 /* Only overallocate the buffer if it's not the last write */
6769 writer.overallocate = (collend < size);
6770
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006772 if (error_handler == _Py_ERROR_UNKNOWN)
6773 error_handler = get_error_handler(errors);
6774
6775 switch (error_handler) {
6776 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006777 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006779
6780 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006781 memset(str, '?', collend - collstart);
6782 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006783 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006784 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 break;
Victor Stinner50149202015-09-22 00:26:54 +02006787
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006788 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006789 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006790 writer.min_size -= (collend - collstart);
6791 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006792 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006793 if (str == NULL)
6794 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 pos = collend;
6796 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006797
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006799 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006800 writer.min_size -= (collend - collstart);
6801 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006802 unicode, collstart, collend);
6803 if (str == NULL)
6804 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 break;
Victor Stinner50149202015-09-22 00:26:54 +02006807
Victor Stinnerc3713e92015-09-29 12:32:13 +02006808 case _Py_ERROR_SURROGATEESCAPE:
6809 for (i = collstart; i < collend; ++i) {
6810 ch = PyUnicode_READ(kind, data, i);
6811 if (ch < 0xdc80 || 0xdcff < ch) {
6812 /* Not a UTF-8b surrogate */
6813 break;
6814 }
6815 *str++ = (char)(ch - 0xdc00);
6816 ++pos;
6817 }
6818 if (i >= collend)
6819 break;
6820 collstart = pos;
6821 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006822 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006823
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006825 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6826 encoding, reason, unicode, &exc,
6827 collstart, collend, &newpos);
6828 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006830
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006831 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006832 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006833
Victor Stinner6bd525b2015-10-09 13:10:05 +02006834 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006835 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006836 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 PyBytes_AS_STRING(rep),
6838 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006839 if (str == NULL)
6840 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006841 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842 else {
6843 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006844
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006847
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006848 if (limit == 256 ?
6849 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6850 !PyUnicode_IS_ASCII(rep))
6851 {
6852 /* Not all characters are smaller than limit */
6853 raise_encode_exception(&exc, encoding, unicode,
6854 collstart, collend, reason);
6855 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006857 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6858 str = _PyBytesWriter_WriteBytes(&writer, str,
6859 PyUnicode_DATA(rep),
6860 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006862 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006863 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006864 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006865
6866 /* If overallocation was disabled, ensure that it was the last
6867 write. Otherwise, we missed an optimization */
6868 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006869 }
6870 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006871
Victor Stinner50149202015-09-22 00:26:54 +02006872 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006873 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006874 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006875
6876 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006877 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006878 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006879 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006880 Py_XDECREF(exc);
6881 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006882}
6883
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006884/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006885PyObject *
6886PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006887 Py_ssize_t size,
6888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006891 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006892 if (unicode == NULL)
6893 return NULL;
6894 result = unicode_encode_ucs1(unicode, errors, 256);
6895 Py_DECREF(unicode);
6896 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897}
6898
Alexander Belopolsky40018472011-02-26 01:02:56 +00006899PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006900_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901{
6902 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 PyErr_BadArgument();
6904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006906 if (PyUnicode_READY(unicode) == -1)
6907 return NULL;
6908 /* Fast path: if it is a one-byte string, construct
6909 bytes object directly. */
6910 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6911 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6912 PyUnicode_GET_LENGTH(unicode));
6913 /* Non-Latin-1 characters present. Defer to above function to
6914 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006915 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006916}
6917
6918PyObject*
6919PyUnicode_AsLatin1String(PyObject *unicode)
6920{
6921 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922}
6923
6924/* --- 7-bit ASCII Codec -------------------------------------------------- */
6925
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926PyObject *
6927PyUnicode_DecodeASCII(const char *s,
6928 Py_ssize_t size,
6929 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006932 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006933 int kind;
6934 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006935 Py_ssize_t startinpos;
6936 Py_ssize_t endinpos;
6937 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006939 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006941 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006942
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006944 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006947 if (size == 1 && (unsigned char)s[0] < 128)
6948 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006949
Victor Stinner8f674cc2013-04-17 23:02:17 +02006950 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006951 writer.min_length = size;
6952 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006953 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006956 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006957 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006958 writer.pos = outpos;
6959 if (writer.pos == size)
6960 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006961
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006962 s += writer.pos;
6963 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006965 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006967 PyUnicode_WRITE(kind, data, writer.pos, c);
6968 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006970 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006972
6973 /* byte outsize range 0x00..0x7f: call the error handler */
6974
6975 if (error_handler == _Py_ERROR_UNKNOWN)
6976 error_handler = get_error_handler(errors);
6977
6978 switch (error_handler)
6979 {
6980 case _Py_ERROR_REPLACE:
6981 case _Py_ERROR_SURROGATEESCAPE:
6982 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006983 but we may switch to UCS2 at the first write */
6984 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6985 goto onError;
6986 kind = writer.kind;
6987 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006988
6989 if (error_handler == _Py_ERROR_REPLACE)
6990 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6991 else
6992 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6993 writer.pos++;
6994 ++s;
6995 break;
6996
6997 case _Py_ERROR_IGNORE:
6998 ++s;
6999 break;
7000
7001 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 startinpos = s-starts;
7003 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007005 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 "ascii", "ordinal not in range(128)",
7007 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007010 kind = writer.kind;
7011 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007014 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007015 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007016 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007017
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007019 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007020 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 return NULL;
7023}
7024
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007025/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007026PyObject *
7027PyUnicode_EncodeASCII(const Py_UNICODE *p,
7028 Py_ssize_t size,
7029 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007031 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007032 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007033 if (unicode == NULL)
7034 return NULL;
7035 result = unicode_encode_ucs1(unicode, errors, 128);
7036 Py_DECREF(unicode);
7037 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Alexander Belopolsky40018472011-02-26 01:02:56 +00007040PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007041_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042{
7043 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 PyErr_BadArgument();
7045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007047 if (PyUnicode_READY(unicode) == -1)
7048 return NULL;
7049 /* Fast path: if it is an ASCII-only string, construct bytes object
7050 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007051 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007052 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7053 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007054 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055}
7056
7057PyObject *
7058PyUnicode_AsASCIIString(PyObject *unicode)
7059{
7060 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061}
7062
Steve Dowercc16be82016-09-08 10:35:16 -07007063#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007064
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007065/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007066
/* NEED_RETRY is defined when an int is narrower than a size_t.  The code
   page decoder tests it to decide whether an input longer than INT_MAX has
   to be processed in several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide the flag. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7074
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007075static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007076code_page_name(UINT code_page, PyObject **obj)
7077{
7078 *obj = NULL;
7079 if (code_page == CP_ACP)
7080 return "mbcs";
7081 if (code_page == CP_UTF7)
7082 return "CP_UTF7";
7083 if (code_page == CP_UTF8)
7084 return "CP_UTF8";
7085
7086 *obj = PyBytes_FromFormat("cp%u", code_page);
7087 if (*obj == NULL)
7088 return NULL;
7089 return PyBytes_AS_STRING(*obj);
7090}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091
Victor Stinner3a50e702011-10-18 21:21:00 +02007092static DWORD
7093decode_code_page_flags(UINT code_page)
7094{
7095 if (code_page == CP_UTF7) {
7096 /* The CP_UTF7 decoder only supports flags=0 */
7097 return 0;
7098 }
7099 else
7100 return MB_ERR_INVALID_CHARS;
7101}
7102
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007104 * Decode a byte string from a Windows code page into unicode object in strict
7105 * mode.
7106 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007107 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7108 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007111decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007112 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 const char *in,
7114 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115{
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007117 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119
7120 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 assert(insize > 0);
7122 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7123 if (outsize <= 0)
7124 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125
7126 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007128 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007129 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 if (*v == NULL)
7131 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133 }
7134 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007137 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007140 }
7141
7142 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7144 if (outsize <= 0)
7145 goto error;
7146 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007147
Victor Stinner3a50e702011-10-18 21:21:00 +02007148error:
7149 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7150 return -2;
7151 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007152 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153}
7154
Victor Stinner3a50e702011-10-18 21:21:00 +02007155/*
7156 * Decode a byte string from a code page into unicode object with an error
7157 * handler.
7158 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007159 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 * UnicodeDecodeError exception and returns -1 on error.
7161 */
7162static int
7163decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007164 PyObject **v,
7165 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007166 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007167{
7168 const char *startin = in;
7169 const char *endin = in + size;
7170 const DWORD flags = decode_code_page_flags(code_page);
7171 /* Ideally, we should get reason from FormatMessage. This is the Windows
7172 2000 English version of the message. */
7173 const char *reason = "No mapping for the Unicode character exists "
7174 "in the target code page.";
7175 /* each step cannot decode more than 1 character, but a character can be
7176 represented as a surrogate pair */
7177 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007178 int insize;
7179 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 PyObject *errorHandler = NULL;
7181 PyObject *exc = NULL;
7182 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007183 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 DWORD err;
7185 int ret = -1;
7186
7187 assert(size > 0);
7188
7189 encoding = code_page_name(code_page, &encoding_obj);
7190 if (encoding == NULL)
7191 return -1;
7192
Victor Stinner7d00cc12014-03-17 23:08:06 +01007193 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7195 UnicodeDecodeError. */
7196 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7197 if (exc != NULL) {
7198 PyCodec_StrictErrors(exc);
7199 Py_CLEAR(exc);
7200 }
7201 goto error;
7202 }
7203
7204 if (*v == NULL) {
7205 /* Create unicode object */
7206 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7207 PyErr_NoMemory();
7208 goto error;
7209 }
Victor Stinnerab595942011-12-17 04:59:06 +01007210 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007211 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 if (*v == NULL)
7213 goto error;
7214 startout = PyUnicode_AS_UNICODE(*v);
7215 }
7216 else {
7217 /* Extend unicode object */
7218 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7219 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7220 PyErr_NoMemory();
7221 goto error;
7222 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007223 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 goto error;
7225 startout = PyUnicode_AS_UNICODE(*v) + n;
7226 }
7227
7228 /* Decode the byte string character per character */
7229 out = startout;
7230 while (in < endin)
7231 {
7232 /* Decode a character */
7233 insize = 1;
7234 do
7235 {
7236 outsize = MultiByteToWideChar(code_page, flags,
7237 in, insize,
7238 buffer, Py_ARRAY_LENGTH(buffer));
7239 if (outsize > 0)
7240 break;
7241 err = GetLastError();
7242 if (err != ERROR_NO_UNICODE_TRANSLATION
7243 && err != ERROR_INSUFFICIENT_BUFFER)
7244 {
7245 PyErr_SetFromWindowsErr(0);
7246 goto error;
7247 }
7248 insize++;
7249 }
7250 /* 4=maximum length of a UTF-8 sequence */
7251 while (insize <= 4 && (in + insize) <= endin);
7252
7253 if (outsize <= 0) {
7254 Py_ssize_t startinpos, endinpos, outpos;
7255
Victor Stinner7d00cc12014-03-17 23:08:06 +01007256 /* last character in partial decode? */
7257 if (in + insize >= endin && !final)
7258 break;
7259
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 startinpos = in - startin;
7261 endinpos = startinpos + 1;
7262 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007263 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 errors, &errorHandler,
7265 encoding, reason,
7266 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007267 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 {
7269 goto error;
7270 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007271 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 }
7273 else {
7274 in += insize;
7275 memcpy(out, buffer, outsize * sizeof(wchar_t));
7276 out += outsize;
7277 }
7278 }
7279
7280 /* write a NUL character at the end */
7281 *out = 0;
7282
7283 /* Extend unicode object */
7284 outsize = out - startout;
7285 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007286 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007288 /* (in - startin) <= size and size is an int */
7289 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007290
7291error:
7292 Py_XDECREF(encoding_obj);
7293 Py_XDECREF(errorHandler);
7294 Py_XDECREF(exc);
7295 return ret;
7296}
7297
Victor Stinner3a50e702011-10-18 21:21:00 +02007298static PyObject *
7299decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007300 const char *s, Py_ssize_t size,
7301 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302{
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 PyObject *v = NULL;
7304 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 if (code_page < 0) {
7307 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7308 return NULL;
7309 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007310 if (size < 0) {
7311 PyErr_BadInternalCall();
7312 return NULL;
7313 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007314
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317
Victor Stinner76a31a62011-11-04 00:05:13 +01007318 do
7319 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 if (size > INT_MAX) {
7322 chunk_size = INT_MAX;
7323 final = 0;
7324 done = 0;
7325 }
7326 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007328 {
7329 chunk_size = (int)size;
7330 final = (consumed == NULL);
7331 done = 1;
7332 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333
Victor Stinner76a31a62011-11-04 00:05:13 +01007334 if (chunk_size == 0 && done) {
7335 if (v != NULL)
7336 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007337 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007339
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 converted = decode_code_page_strict(code_page, &v,
7341 s, chunk_size);
7342 if (converted == -2)
7343 converted = decode_code_page_errors(code_page, &v,
7344 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007345 errors, final);
7346 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007347
7348 if (converted < 0) {
7349 Py_XDECREF(v);
7350 return NULL;
7351 }
7352
7353 if (consumed)
7354 *consumed += converted;
7355
7356 s += converted;
7357 size -= converted;
7358 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007359
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007360 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007361}
7362
Alexander Belopolsky40018472011-02-26 01:02:56 +00007363PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007364PyUnicode_DecodeCodePageStateful(int code_page,
7365 const char *s,
7366 Py_ssize_t size,
7367 const char *errors,
7368 Py_ssize_t *consumed)
7369{
7370 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7371}
7372
7373PyObject *
7374PyUnicode_DecodeMBCSStateful(const char *s,
7375 Py_ssize_t size,
7376 const char *errors,
7377 Py_ssize_t *consumed)
7378{
7379 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7380}
7381
7382PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383PyUnicode_DecodeMBCS(const char *s,
7384 Py_ssize_t size,
7385 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007386{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7388}
7389
Victor Stinner3a50e702011-10-18 21:21:00 +02007390static DWORD
7391encode_code_page_flags(UINT code_page, const char *errors)
7392{
7393 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007394 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 }
7396 else if (code_page == CP_UTF7) {
7397 /* CP_UTF7 only supports flags=0 */
7398 return 0;
7399 }
7400 else {
7401 if (errors != NULL && strcmp(errors, "replace") == 0)
7402 return 0;
7403 else
7404 return WC_NO_BEST_FIT_CHARS;
7405 }
7406}
7407
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007408/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 * Encode a Unicode string to a Windows code page into a byte string in strict
7410 * mode.
7411 *
7412 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007413 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007414 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007415static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007416encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007417 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007419{
Victor Stinner554f3f02010-06-16 23:33:54 +00007420 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 BOOL *pusedDefaultChar = &usedDefaultChar;
7422 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007423 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007424 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 const DWORD flags = encode_code_page_flags(code_page, NULL);
7426 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 /* Create a substring so that we can get the UTF-16 representation
7428 of just the slice under consideration. */
7429 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007430
Martin v. Löwis3d325192011-11-04 18:23:06 +01007431 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007432
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007434 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007436 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007437
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 substring = PyUnicode_Substring(unicode, offset, offset+len);
7439 if (substring == NULL)
7440 return -1;
7441 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7442 if (p == NULL) {
7443 Py_DECREF(substring);
7444 return -1;
7445 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007446 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007447
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007448 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007450 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 NULL, 0,
7452 NULL, pusedDefaultChar);
7453 if (outsize <= 0)
7454 goto error;
7455 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 if (pusedDefaultChar && *pusedDefaultChar) {
7457 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007464 if (*outbytes == NULL) {
7465 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469 }
7470 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 const Py_ssize_t n = PyBytes_Size(*outbytes);
7473 if (outsize > PY_SSIZE_T_MAX - n) {
7474 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007475 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007478 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7479 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007483 }
7484
7485 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007487 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 out, outsize,
7489 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 if (outsize <= 0)
7492 goto error;
7493 if (pusedDefaultChar && *pusedDefaultChar)
7494 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007495 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007496
Victor Stinner3a50e702011-10-18 21:21:00 +02007497error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7500 return -2;
7501 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007502 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007503}
7504
Victor Stinner3a50e702011-10-18 21:21:00 +02007505/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007506 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 * error handler.
7508 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007509 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 * -1 on other error.
7511 */
7512static int
7513encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007514 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007515 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007516{
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007518 Py_ssize_t pos = unicode_offset;
7519 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 /* Ideally, we should get reason from FormatMessage. This is the Windows
7521 2000 English version of the message. */
7522 const char *reason = "invalid character";
7523 /* 4=maximum length of a UTF-8 sequence */
7524 char buffer[4];
7525 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7526 Py_ssize_t outsize;
7527 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 PyObject *errorHandler = NULL;
7529 PyObject *exc = NULL;
7530 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007531 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007532 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 PyObject *rep;
7534 int ret = -1;
7535
7536 assert(insize > 0);
7537
7538 encoding = code_page_name(code_page, &encoding_obj);
7539 if (encoding == NULL)
7540 return -1;
7541
7542 if (errors == NULL || strcmp(errors, "strict") == 0) {
7543 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7544 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007545 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 if (exc != NULL) {
7547 PyCodec_StrictErrors(exc);
7548 Py_DECREF(exc);
7549 }
7550 Py_XDECREF(encoding_obj);
7551 return -1;
7552 }
7553
7554 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7555 pusedDefaultChar = &usedDefaultChar;
7556 else
7557 pusedDefaultChar = NULL;
7558
7559 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7560 PyErr_NoMemory();
7561 goto error;
7562 }
7563 outsize = insize * Py_ARRAY_LENGTH(buffer);
7564
7565 if (*outbytes == NULL) {
7566 /* Create string object */
7567 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7568 if (*outbytes == NULL)
7569 goto error;
7570 out = PyBytes_AS_STRING(*outbytes);
7571 }
7572 else {
7573 /* Extend string object */
7574 Py_ssize_t n = PyBytes_Size(*outbytes);
7575 if (n > PY_SSIZE_T_MAX - outsize) {
7576 PyErr_NoMemory();
7577 goto error;
7578 }
7579 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7580 goto error;
7581 out = PyBytes_AS_STRING(*outbytes) + n;
7582 }
7583
7584 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007585 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007587 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7588 wchar_t chars[2];
7589 int charsize;
7590 if (ch < 0x10000) {
7591 chars[0] = (wchar_t)ch;
7592 charsize = 1;
7593 }
7594 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007595 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7596 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007597 charsize = 2;
7598 }
7599
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007601 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 buffer, Py_ARRAY_LENGTH(buffer),
7603 NULL, pusedDefaultChar);
7604 if (outsize > 0) {
7605 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7606 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007607 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 memcpy(out, buffer, outsize);
7609 out += outsize;
7610 continue;
7611 }
7612 }
7613 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7614 PyErr_SetFromWindowsErr(0);
7615 goto error;
7616 }
7617
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 rep = unicode_encode_call_errorhandler(
7619 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007620 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007621 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 if (rep == NULL)
7623 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007625
7626 if (PyBytes_Check(rep)) {
7627 outsize = PyBytes_GET_SIZE(rep);
7628 if (outsize != 1) {
7629 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7630 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7631 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7632 Py_DECREF(rep);
7633 goto error;
7634 }
7635 out = PyBytes_AS_STRING(*outbytes) + offset;
7636 }
7637 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7638 out += outsize;
7639 }
7640 else {
7641 Py_ssize_t i;
7642 enum PyUnicode_Kind kind;
7643 void *data;
7644
Benjamin Petersonbac79492012-01-14 13:34:47 -05007645 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 Py_DECREF(rep);
7647 goto error;
7648 }
7649
7650 outsize = PyUnicode_GET_LENGTH(rep);
7651 if (outsize != 1) {
7652 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7653 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7654 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7655 Py_DECREF(rep);
7656 goto error;
7657 }
7658 out = PyBytes_AS_STRING(*outbytes) + offset;
7659 }
7660 kind = PyUnicode_KIND(rep);
7661 data = PyUnicode_DATA(rep);
7662 for (i=0; i < outsize; i++) {
7663 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7664 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007665 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007666 encoding, unicode,
7667 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 "unable to encode error handler result to ASCII");
7669 Py_DECREF(rep);
7670 goto error;
7671 }
7672 *out = (unsigned char)ch;
7673 out++;
7674 }
7675 }
7676 Py_DECREF(rep);
7677 }
7678 /* write a NUL byte */
7679 *out = 0;
7680 outsize = out - PyBytes_AS_STRING(*outbytes);
7681 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7682 if (_PyBytes_Resize(outbytes, outsize) < 0)
7683 goto error;
7684 ret = 0;
7685
7686error:
7687 Py_XDECREF(encoding_obj);
7688 Py_XDECREF(errorHandler);
7689 Py_XDECREF(exc);
7690 return ret;
7691}
7692
Victor Stinner3a50e702011-10-18 21:21:00 +02007693static PyObject *
7694encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007695 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007696 const char *errors)
7697{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007698 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007700 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007701 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007702
Victor Stinner29dacf22015-01-26 16:41:32 +01007703 if (!PyUnicode_Check(unicode)) {
7704 PyErr_BadArgument();
7705 return NULL;
7706 }
7707
Benjamin Petersonbac79492012-01-14 13:34:47 -05007708 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007709 return NULL;
7710 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007711
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 if (code_page < 0) {
7713 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7714 return NULL;
7715 }
7716
Martin v. Löwis3d325192011-11-04 18:23:06 +01007717 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007718 return PyBytes_FromStringAndSize(NULL, 0);
7719
Victor Stinner7581cef2011-11-03 22:32:33 +01007720 offset = 0;
7721 do
7722 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007723#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007724 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 chunks. */
7726 if (len > INT_MAX/2) {
7727 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007728 done = 0;
7729 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007730 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007731#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007733 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007734 done = 1;
7735 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007736
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007739 errors);
7740 if (ret == -2)
7741 ret = encode_code_page_errors(code_page, &outbytes,
7742 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007743 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007744 if (ret < 0) {
7745 Py_XDECREF(outbytes);
7746 return NULL;
7747 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007748
Victor Stinner7581cef2011-11-03 22:32:33 +01007749 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007750 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007751 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007752
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 return outbytes;
7754}
7755
7756PyObject *
7757PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7758 Py_ssize_t size,
7759 const char *errors)
7760{
Victor Stinner7581cef2011-11-03 22:32:33 +01007761 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007762 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007763 if (unicode == NULL)
7764 return NULL;
7765 res = encode_code_page(CP_ACP, unicode, errors);
7766 Py_DECREF(unicode);
7767 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007768}
7769
7770PyObject *
7771PyUnicode_EncodeCodePage(int code_page,
7772 PyObject *unicode,
7773 const char *errors)
7774{
Victor Stinner7581cef2011-11-03 22:32:33 +01007775 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007776}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007777
Alexander Belopolsky40018472011-02-26 01:02:56 +00007778PyObject *
7779PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007780{
Victor Stinner7581cef2011-11-03 22:32:33 +01007781 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007782}
7783
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007784#undef NEED_RETRY
7785
Steve Dowercc16be82016-09-08 10:35:16 -07007786#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007787
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788/* --- Character Mapping Codec -------------------------------------------- */
7789
Victor Stinnerfb161b12013-04-18 01:44:27 +02007790static int
7791charmap_decode_string(const char *s,
7792 Py_ssize_t size,
7793 PyObject *mapping,
7794 const char *errors,
7795 _PyUnicodeWriter *writer)
7796{
7797 const char *starts = s;
7798 const char *e;
7799 Py_ssize_t startinpos, endinpos;
7800 PyObject *errorHandler = NULL, *exc = NULL;
7801 Py_ssize_t maplen;
7802 enum PyUnicode_Kind mapkind;
7803 void *mapdata;
7804 Py_UCS4 x;
7805 unsigned char ch;
7806
7807 if (PyUnicode_READY(mapping) == -1)
7808 return -1;
7809
7810 maplen = PyUnicode_GET_LENGTH(mapping);
7811 mapdata = PyUnicode_DATA(mapping);
7812 mapkind = PyUnicode_KIND(mapping);
7813
7814 e = s + size;
7815
7816 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7817 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7818 * is disabled in encoding aliases, latin1 is preferred because
7819 * its implementation is faster. */
7820 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7821 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7822 Py_UCS4 maxchar = writer->maxchar;
7823
7824 assert (writer->kind == PyUnicode_1BYTE_KIND);
7825 while (s < e) {
7826 ch = *s;
7827 x = mapdata_ucs1[ch];
7828 if (x > maxchar) {
7829 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7830 goto onError;
7831 maxchar = writer->maxchar;
7832 outdata = (Py_UCS1 *)writer->data;
7833 }
7834 outdata[writer->pos] = x;
7835 writer->pos++;
7836 ++s;
7837 }
7838 return 0;
7839 }
7840
7841 while (s < e) {
7842 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7843 enum PyUnicode_Kind outkind = writer->kind;
7844 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7845 if (outkind == PyUnicode_1BYTE_KIND) {
7846 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7847 Py_UCS4 maxchar = writer->maxchar;
7848 while (s < e) {
7849 ch = *s;
7850 x = mapdata_ucs2[ch];
7851 if (x > maxchar)
7852 goto Error;
7853 outdata[writer->pos] = x;
7854 writer->pos++;
7855 ++s;
7856 }
7857 break;
7858 }
7859 else if (outkind == PyUnicode_2BYTE_KIND) {
7860 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7861 while (s < e) {
7862 ch = *s;
7863 x = mapdata_ucs2[ch];
7864 if (x == 0xFFFE)
7865 goto Error;
7866 outdata[writer->pos] = x;
7867 writer->pos++;
7868 ++s;
7869 }
7870 break;
7871 }
7872 }
7873 ch = *s;
7874
7875 if (ch < maplen)
7876 x = PyUnicode_READ(mapkind, mapdata, ch);
7877 else
7878 x = 0xfffe; /* invalid value */
7879Error:
7880 if (x == 0xfffe)
7881 {
7882 /* undefined mapping */
7883 startinpos = s-starts;
7884 endinpos = startinpos+1;
7885 if (unicode_decode_call_errorhandler_writer(
7886 errors, &errorHandler,
7887 "charmap", "character maps to <undefined>",
7888 &starts, &e, &startinpos, &endinpos, &exc, &s,
7889 writer)) {
7890 goto onError;
7891 }
7892 continue;
7893 }
7894
7895 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7896 goto onError;
7897 ++s;
7898 }
7899 Py_XDECREF(errorHandler);
7900 Py_XDECREF(exc);
7901 return 0;
7902
7903onError:
7904 Py_XDECREF(errorHandler);
7905 Py_XDECREF(exc);
7906 return -1;
7907}
7908
7909static int
7910charmap_decode_mapping(const char *s,
7911 Py_ssize_t size,
7912 PyObject *mapping,
7913 const char *errors,
7914 _PyUnicodeWriter *writer)
7915{
7916 const char *starts = s;
7917 const char *e;
7918 Py_ssize_t startinpos, endinpos;
7919 PyObject *errorHandler = NULL, *exc = NULL;
7920 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007921 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007922
7923 e = s + size;
7924
7925 while (s < e) {
7926 ch = *s;
7927
7928 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7929 key = PyLong_FromLong((long)ch);
7930 if (key == NULL)
7931 goto onError;
7932
7933 item = PyObject_GetItem(mapping, key);
7934 Py_DECREF(key);
7935 if (item == NULL) {
7936 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7937 /* No mapping found means: mapping is undefined. */
7938 PyErr_Clear();
7939 goto Undefined;
7940 } else
7941 goto onError;
7942 }
7943
7944 /* Apply mapping */
7945 if (item == Py_None)
7946 goto Undefined;
7947 if (PyLong_Check(item)) {
7948 long value = PyLong_AS_LONG(item);
7949 if (value == 0xFFFE)
7950 goto Undefined;
7951 if (value < 0 || value > MAX_UNICODE) {
7952 PyErr_Format(PyExc_TypeError,
7953 "character mapping must be in range(0x%lx)",
7954 (unsigned long)MAX_UNICODE + 1);
7955 goto onError;
7956 }
7957
7958 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7959 goto onError;
7960 }
7961 else if (PyUnicode_Check(item)) {
7962 if (PyUnicode_READY(item) == -1)
7963 goto onError;
7964 if (PyUnicode_GET_LENGTH(item) == 1) {
7965 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7966 if (value == 0xFFFE)
7967 goto Undefined;
7968 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7969 goto onError;
7970 }
7971 else {
7972 writer->overallocate = 1;
7973 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7974 goto onError;
7975 }
7976 }
7977 else {
7978 /* wrong return value */
7979 PyErr_SetString(PyExc_TypeError,
7980 "character mapping must return integer, None or str");
7981 goto onError;
7982 }
7983 Py_CLEAR(item);
7984 ++s;
7985 continue;
7986
7987Undefined:
7988 /* undefined mapping */
7989 Py_CLEAR(item);
7990 startinpos = s-starts;
7991 endinpos = startinpos+1;
7992 if (unicode_decode_call_errorhandler_writer(
7993 errors, &errorHandler,
7994 "charmap", "character maps to <undefined>",
7995 &starts, &e, &startinpos, &endinpos, &exc, &s,
7996 writer)) {
7997 goto onError;
7998 }
7999 }
8000 Py_XDECREF(errorHandler);
8001 Py_XDECREF(exc);
8002 return 0;
8003
8004onError:
8005 Py_XDECREF(item);
8006 Py_XDECREF(errorHandler);
8007 Py_XDECREF(exc);
8008 return -1;
8009}
8010
Alexander Belopolsky40018472011-02-26 01:02:56 +00008011PyObject *
8012PyUnicode_DecodeCharmap(const char *s,
8013 Py_ssize_t size,
8014 PyObject *mapping,
8015 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008017 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008018
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 /* Default to Latin-1 */
8020 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008024 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008025 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008026 writer.min_length = size;
8027 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008029
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008030 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008031 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8032 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008033 }
8034 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008035 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8036 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008038 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008039
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008041 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 return NULL;
8043}
8044
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045/* Charmap encoding: the lookup table */
8046
Alexander Belopolsky40018472011-02-26 01:02:56 +00008047struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 PyObject_HEAD
8049 unsigned char level1[32];
8050 int count2, count3;
8051 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052};
8053
8054static PyObject*
8055encoding_map_size(PyObject *obj, PyObject* args)
8056{
8057 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008058 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060}
8061
8062static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008063 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 PyDoc_STR("Return the size (in bytes) of this object") },
8065 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066};
8067
8068static void
8069encoding_map_dealloc(PyObject* o)
8070{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072}
8073
8074static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 "EncodingMap", /*tp_name*/
8077 sizeof(struct encoding_map), /*tp_basicsize*/
8078 0, /*tp_itemsize*/
8079 /* methods */
8080 encoding_map_dealloc, /*tp_dealloc*/
8081 0, /*tp_print*/
8082 0, /*tp_getattr*/
8083 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008084 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 0, /*tp_repr*/
8086 0, /*tp_as_number*/
8087 0, /*tp_as_sequence*/
8088 0, /*tp_as_mapping*/
8089 0, /*tp_hash*/
8090 0, /*tp_call*/
8091 0, /*tp_str*/
8092 0, /*tp_getattro*/
8093 0, /*tp_setattro*/
8094 0, /*tp_as_buffer*/
8095 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8096 0, /*tp_doc*/
8097 0, /*tp_traverse*/
8098 0, /*tp_clear*/
8099 0, /*tp_richcompare*/
8100 0, /*tp_weaklistoffset*/
8101 0, /*tp_iter*/
8102 0, /*tp_iternext*/
8103 encoding_map_methods, /*tp_methods*/
8104 0, /*tp_members*/
8105 0, /*tp_getset*/
8106 0, /*tp_base*/
8107 0, /*tp_dict*/
8108 0, /*tp_descr_get*/
8109 0, /*tp_descr_set*/
8110 0, /*tp_dictoffset*/
8111 0, /*tp_init*/
8112 0, /*tp_alloc*/
8113 0, /*tp_new*/
8114 0, /*tp_free*/
8115 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116};
8117
8118PyObject*
8119PyUnicode_BuildEncodingMap(PyObject* string)
8120{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121 PyObject *result;
8122 struct encoding_map *mresult;
8123 int i;
8124 int need_dict = 0;
8125 unsigned char level1[32];
8126 unsigned char level2[512];
8127 unsigned char *mlevel1, *mlevel2, *mlevel3;
8128 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129 int kind;
8130 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008131 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 PyErr_BadArgument();
8136 return NULL;
8137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 kind = PyUnicode_KIND(string);
8139 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008140 length = PyUnicode_GET_LENGTH(string);
8141 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 memset(level1, 0xFF, sizeof level1);
8143 memset(level2, 0xFF, sizeof level2);
8144
8145 /* If there isn't a one-to-one mapping of NULL to \0,
8146 or if there are non-BMP characters, we need to use
8147 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008150 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 ch = PyUnicode_READ(kind, data, i);
8153 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 need_dict = 1;
8155 break;
8156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158 /* unmapped character */
8159 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 l1 = ch >> 11;
8161 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 if (level1[l1] == 0xFF)
8163 level1[l1] = count2++;
8164 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 }
8167
8168 if (count2 >= 0xFF || count3 >= 0xFF)
8169 need_dict = 1;
8170
8171 if (need_dict) {
8172 PyObject *result = PyDict_New();
8173 PyObject *key, *value;
8174 if (!result)
8175 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008176 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008177 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008178 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008179 if (!key || !value)
8180 goto failed1;
8181 if (PyDict_SetItem(result, key, value) == -1)
8182 goto failed1;
8183 Py_DECREF(key);
8184 Py_DECREF(value);
8185 }
8186 return result;
8187 failed1:
8188 Py_XDECREF(key);
8189 Py_XDECREF(value);
8190 Py_DECREF(result);
8191 return NULL;
8192 }
8193
8194 /* Create a three-level trie */
8195 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8196 16*count2 + 128*count3 - 1);
8197 if (!result)
8198 return PyErr_NoMemory();
8199 PyObject_Init(result, &EncodingMapType);
8200 mresult = (struct encoding_map*)result;
8201 mresult->count2 = count2;
8202 mresult->count3 = count3;
8203 mlevel1 = mresult->level1;
8204 mlevel2 = mresult->level23;
8205 mlevel3 = mresult->level23 + 16*count2;
8206 memcpy(mlevel1, level1, 32);
8207 memset(mlevel2, 0xFF, 16*count2);
8208 memset(mlevel3, 0, 128*count3);
8209 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008210 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008212 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8213 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 /* unmapped character */
8215 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008216 o1 = ch>>11;
8217 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218 i2 = 16*mlevel1[o1] + o2;
8219 if (mlevel2[i2] == 0xFF)
8220 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008221 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222 i3 = 128*mlevel2[i2] + o3;
8223 mlevel3[i3] = i;
8224 }
8225 return result;
8226}
8227
8228static int
Victor Stinner22168992011-11-20 17:09:18 +01008229encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008230{
8231 struct encoding_map *map = (struct encoding_map*)mapping;
8232 int l1 = c>>11;
8233 int l2 = (c>>7) & 0xF;
8234 int l3 = c & 0x7F;
8235 int i;
8236
Victor Stinner22168992011-11-20 17:09:18 +01008237 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239 if (c == 0)
8240 return 0;
8241 /* level 1*/
8242 i = map->level1[l1];
8243 if (i == 0xFF) {
8244 return -1;
8245 }
8246 /* level 2*/
8247 i = map->level23[16*i+l2];
8248 if (i == 0xFF) {
8249 return -1;
8250 }
8251 /* level 3 */
8252 i = map->level23[16*map->count2 + 128*i + l3];
8253 if (i == 0) {
8254 return -1;
8255 }
8256 return i;
8257}
8258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259/* Lookup the character ch in the mapping. If the character
8260 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008261 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008262static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008263charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264{
Christian Heimes217cfd12007-12-02 14:31:20 +00008265 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 PyObject *x;
8267
8268 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 x = PyObject_GetItem(mapping, w);
8271 Py_DECREF(w);
8272 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8274 /* No mapping found means: mapping is undefined. */
8275 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008276 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 } else
8278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008280 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008282 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 long value = PyLong_AS_LONG(x);
8284 if (value < 0 || value > 255) {
8285 PyErr_SetString(PyExc_TypeError,
8286 "character mapping must be in range(256)");
8287 Py_DECREF(x);
8288 return NULL;
8289 }
8290 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008292 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 /* wrong return value */
8296 PyErr_Format(PyExc_TypeError,
8297 "character mapping must return integer, bytes or None, not %.400s",
8298 x->ob_type->tp_name);
8299 Py_DECREF(x);
8300 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 }
8302}
8303
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008305charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008306{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008307 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8308 /* exponentially overallocate to minimize reallocations */
8309 if (requiredsize < 2*outsize)
8310 requiredsize = 2*outsize;
8311 if (_PyBytes_Resize(outobj, requiredsize))
8312 return -1;
8313 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314}
8315
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* a Python exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008320 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 space is available. Return a new reference to the object that
8322 was put in the output buffer, or Py_None, if the mapping was undefined
8323 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008324 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008326charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008327 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008329 PyObject *rep;
8330 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008331 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332
Christian Heimes90aa7642007-12-19 02:45:37 +00008333 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336 if (res == -1)
8337 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 if (outsize<requiredsize)
8339 if (charmapencode_resize(outobj, outpos, requiredsize))
8340 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008341 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 outstart[(*outpos)++] = (char)res;
8343 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 }
8345
8346 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 Py_DECREF(rep);
8351 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 if (PyLong_Check(rep)) {
8354 Py_ssize_t requiredsize = *outpos+1;
8355 if (outsize<requiredsize)
8356 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8357 Py_DECREF(rep);
8358 return enc_EXCEPTION;
8359 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008360 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008362 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 else {
8364 const char *repchars = PyBytes_AS_STRING(rep);
8365 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8366 Py_ssize_t requiredsize = *outpos+repsize;
8367 if (outsize<requiredsize)
8368 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8369 Py_DECREF(rep);
8370 return enc_EXCEPTION;
8371 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008372 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 memcpy(outstart + *outpos, repchars, repsize);
8374 *outpos += repsize;
8375 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377 Py_DECREF(rep);
8378 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379}
8380
8381/* handle an error in PyUnicode_EncodeCharmap
8382 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008383static int
8384charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008385 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008387 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008388 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389{
8390 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008391 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008392 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008393 enum PyUnicode_Kind kind;
8394 void *data;
8395 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008397 Py_ssize_t collstartpos = *inpos;
8398 Py_ssize_t collendpos = *inpos+1;
8399 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 char *encoding = "charmap";
8401 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008402 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008403 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008404 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405
Benjamin Petersonbac79492012-01-14 13:34:47 -05008406 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008407 return -1;
8408 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409 /* find all unencodable characters */
8410 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008411 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008412 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008413 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008414 val = encoding_map_lookup(ch, mapping);
8415 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 break;
8417 ++collendpos;
8418 continue;
8419 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008420
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008421 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8422 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 if (rep==NULL)
8424 return -1;
8425 else if (rep!=Py_None) {
8426 Py_DECREF(rep);
8427 break;
8428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008429 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431 }
8432 /* cache callback name lookup
8433 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008434 if (*error_handler == _Py_ERROR_UNKNOWN)
8435 *error_handler = get_error_handler(errors);
8436
8437 switch (*error_handler) {
8438 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008439 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008441
8442 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 x = charmapencode_output('?', mapping, res, respos);
8445 if (x==enc_EXCEPTION) {
8446 return -1;
8447 }
8448 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008449 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return -1;
8451 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452 }
8453 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008454 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 *inpos = collendpos;
8456 break;
Victor Stinner50149202015-09-22 00:26:54 +02008457
8458 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 /* generate replacement (temporarily (mis)uses p) */
8460 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 char buffer[2+29+1+1];
8462 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008463 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 for (cp = buffer; *cp; ++cp) {
8465 x = charmapencode_output(*cp, mapping, res, respos);
8466 if (x==enc_EXCEPTION)
8467 return -1;
8468 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008469 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return -1;
8471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 }
8473 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008474 *inpos = collendpos;
8475 break;
Victor Stinner50149202015-09-22 00:26:54 +02008476
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 default:
Victor Stinner50149202015-09-22 00:26:54 +02008478 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008479 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008481 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008483 if (PyBytes_Check(repunicode)) {
8484 /* Directly copy bytes result to output. */
8485 Py_ssize_t outsize = PyBytes_Size(*res);
8486 Py_ssize_t requiredsize;
8487 repsize = PyBytes_Size(repunicode);
8488 requiredsize = *respos + repsize;
8489 if (requiredsize > outsize)
8490 /* Make room for all additional bytes. */
8491 if (charmapencode_resize(res, respos, requiredsize)) {
8492 Py_DECREF(repunicode);
8493 return -1;
8494 }
8495 memcpy(PyBytes_AsString(*res) + *respos,
8496 PyBytes_AsString(repunicode), repsize);
8497 *respos += repsize;
8498 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008499 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008500 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008502 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008503 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008504 Py_DECREF(repunicode);
8505 return -1;
8506 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008507 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008508 data = PyUnicode_DATA(repunicode);
8509 kind = PyUnicode_KIND(repunicode);
8510 for (index = 0; index < repsize; index++) {
8511 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8512 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008514 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 return -1;
8516 }
8517 else if (x==enc_FAILED) {
8518 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008519 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 return -1;
8521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008522 }
8523 *inpos = newpos;
8524 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 }
8526 return 0;
8527}
8528
Alexander Belopolsky40018472011-02-26 01:02:56 +00008529PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008530_PyUnicode_EncodeCharmap(PyObject *unicode,
8531 PyObject *mapping,
8532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 /* output object */
8535 PyObject *res = NULL;
8536 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008537 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008541 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008543 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008544 void *data;
8545 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
Benjamin Petersonbac79492012-01-14 13:34:47 -05008547 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008548 return NULL;
8549 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008550 data = PyUnicode_DATA(unicode);
8551 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008552
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553 /* Default to Latin-1 */
8554 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 /* allocate enough for a simple encoding without
8558 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008559 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 if (res == NULL)
8561 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008562 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008566 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008568 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 if (x==enc_EXCEPTION) /* error */
8570 goto onError;
8571 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008574 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 &res, &respos)) {
8576 goto onError;
8577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 else
8580 /* done with this character => adjust input position */
8581 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008585 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008586 if (_PyBytes_Resize(&res, respos) < 0)
8587 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008590 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008591 return res;
8592
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 Py_XDECREF(res);
8595 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008596 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 return NULL;
8598}
8599
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008600/* Deprecated */
8601PyObject *
8602PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8603 Py_ssize_t size,
8604 PyObject *mapping,
8605 const char *errors)
8606{
8607 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008608 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008609 if (unicode == NULL)
8610 return NULL;
8611 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8612 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008613 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008614}
8615
Alexander Belopolsky40018472011-02-26 01:02:56 +00008616PyObject *
8617PyUnicode_AsCharmapString(PyObject *unicode,
8618 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619{
8620 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 PyErr_BadArgument();
8622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008624 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625}
8626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008628static void
8629make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631 Py_ssize_t startpos, Py_ssize_t endpos,
8632 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 *exceptionObject = _PyUnicodeTranslateError_Create(
8636 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 }
8638 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8640 goto onError;
8641 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8642 goto onError;
8643 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8644 goto onError;
8645 return;
8646 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008647 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 }
8649}
8650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651/* error handling callback helper:
8652 build arguments, call the callback and check the arguments,
8653 put the result into newpos and return the replacement string, which
8654 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655static PyObject *
8656unicode_translate_call_errorhandler(const char *errors,
8657 PyObject **errorHandler,
8658 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660 Py_ssize_t startpos, Py_ssize_t endpos,
8661 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008663 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008665 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 PyObject *restuple;
8667 PyObject *resunicode;
8668
8669 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 }
8674
8675 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008680 restuple = PyObject_CallFunctionObjArgs(
8681 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008685 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 Py_DECREF(restuple);
8687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008689 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 &resunicode, &i_newpos)) {
8691 Py_DECREF(restuple);
8692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008696 else
8697 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008699 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 Py_DECREF(restuple);
8701 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703 Py_INCREF(resunicode);
8704 Py_DECREF(restuple);
8705 return resunicode;
8706}
8707
8708/* Lookup the character ch in the mapping and put the result in result,
8709 which must be decrefed by the caller.
8710 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008711static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713{
Christian Heimes217cfd12007-12-02 14:31:20 +00008714 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 PyObject *x;
8716
8717 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 x = PyObject_GetItem(mapping, w);
8720 Py_DECREF(w);
8721 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8723 /* No mapping found means: use 1:1 mapping. */
8724 PyErr_Clear();
8725 *result = NULL;
8726 return 0;
8727 } else
8728 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 }
8730 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 *result = x;
8732 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008734 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008736 if (value < 0 || value > MAX_UNICODE) {
8737 PyErr_Format(PyExc_ValueError,
8738 "character mapping must be in range(0x%x)",
8739 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 Py_DECREF(x);
8741 return -1;
8742 }
8743 *result = x;
8744 return 0;
8745 }
8746 else if (PyUnicode_Check(x)) {
8747 *result = x;
8748 return 0;
8749 }
8750 else {
8751 /* wrong return value */
8752 PyErr_SetString(PyExc_TypeError,
8753 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008754 Py_DECREF(x);
8755 return -1;
8756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757}
Victor Stinner1194ea02014-04-04 19:37:40 +02008758
8759/* lookup the character, write the result into the writer.
8760 Return 1 if the result was written into the writer, return 0 if the mapping
8761 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008762static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008763charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8764 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765{
Victor Stinner1194ea02014-04-04 19:37:40 +02008766 PyObject *item;
8767
8768 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008770
8771 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008773 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008776 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008778
8779 if (item == Py_None) {
8780 Py_DECREF(item);
8781 return 0;
8782 }
8783
8784 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008785 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8786 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8787 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008788 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8789 Py_DECREF(item);
8790 return -1;
8791 }
8792 Py_DECREF(item);
8793 return 1;
8794 }
8795
8796 if (!PyUnicode_Check(item)) {
8797 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008799 }
8800
8801 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8802 Py_DECREF(item);
8803 return -1;
8804 }
8805
8806 Py_DECREF(item);
8807 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008808}
8809
Victor Stinner89a76ab2014-04-05 11:44:04 +02008810static int
8811unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8812 Py_UCS1 *translate)
8813{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008814 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815 int ret = 0;
8816
Victor Stinner89a76ab2014-04-05 11:44:04 +02008817 if (charmaptranslate_lookup(ch, mapping, &item)) {
8818 return -1;
8819 }
8820
8821 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008822 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008823 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008825 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008826 /* not found => default to 1:1 mapping */
8827 translate[ch] = ch;
8828 return 1;
8829 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008830 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008831 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008832 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8833 used it */
8834 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008835 /* invalid character or character outside ASCII:
8836 skip the fast translate */
8837 goto exit;
8838 }
8839 translate[ch] = (Py_UCS1)replace;
8840 }
8841 else if (PyUnicode_Check(item)) {
8842 Py_UCS4 replace;
8843
8844 if (PyUnicode_READY(item) == -1) {
8845 Py_DECREF(item);
8846 return -1;
8847 }
8848 if (PyUnicode_GET_LENGTH(item) != 1)
8849 goto exit;
8850
8851 replace = PyUnicode_READ_CHAR(item, 0);
8852 if (replace > 127)
8853 goto exit;
8854 translate[ch] = (Py_UCS1)replace;
8855 }
8856 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008857 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 goto exit;
8859 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860 ret = 1;
8861
Benjamin Peterson1365de72014-04-07 20:15:41 -04008862 exit:
8863 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864 return ret;
8865}
8866
8867/* Fast path for ascii => ascii translation. Return 1 if the whole string
8868 was translated into writer, return 0 if the input string was partially
8869 translated into writer, raise an exception and return -1 on error. */
8870static int
8871unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008872 _PyUnicodeWriter *writer, int ignore,
8873 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874{
Victor Stinner872b2912014-04-05 14:27:07 +02008875 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008876 Py_ssize_t len;
8877 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008878 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880 len = PyUnicode_GET_LENGTH(input);
8881
Victor Stinner872b2912014-04-05 14:27:07 +02008882 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883
8884 in = PyUnicode_1BYTE_DATA(input);
8885 end = in + len;
8886
8887 assert(PyUnicode_IS_ASCII(writer->buffer));
8888 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8889 out = PyUnicode_1BYTE_DATA(writer->buffer);
8890
Victor Stinner872b2912014-04-05 14:27:07 +02008891 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008893 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008895 int translate = unicode_fast_translate_lookup(mapping, ch,
8896 ascii_table);
8897 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008899 if (translate == 0)
8900 goto exit;
8901 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 }
Victor Stinner872b2912014-04-05 14:27:07 +02008903 if (ch2 == 0xfe) {
8904 if (ignore)
8905 continue;
8906 goto exit;
8907 }
8908 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008910 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911 }
Victor Stinner872b2912014-04-05 14:27:07 +02008912 res = 1;
8913
8914exit:
8915 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008916 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008917 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918}
8919
Victor Stinner3222da22015-10-01 22:07:32 +02008920static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921_PyUnicode_TranslateCharmap(PyObject *input,
8922 PyObject *mapping,
8923 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008926 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 Py_ssize_t size, i;
8928 int kind;
8929 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 _PyUnicodeWriter writer;
8931 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008932 char *reason = "character maps to <undefined>";
8933 PyObject *errorHandler = NULL;
8934 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008935 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008936 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 PyErr_BadArgument();
8940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 if (PyUnicode_READY(input) == -1)
8944 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008945 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 kind = PyUnicode_KIND(input);
8947 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008949 if (size == 0)
8950 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008952 /* allocate enough for a simple 1:1 translation without
8953 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008954 _PyUnicodeWriter_Init(&writer);
8955 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957
Victor Stinner872b2912014-04-05 14:27:07 +02008958 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8959
Victor Stinner33798672016-03-01 21:59:58 +01008960 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008961 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008962 if (PyUnicode_IS_ASCII(input)) {
8963 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8964 if (res < 0) {
8965 _PyUnicodeWriter_Dealloc(&writer);
8966 return NULL;
8967 }
8968 if (res == 1)
8969 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008970 }
Victor Stinner33798672016-03-01 21:59:58 +01008971 else {
8972 i = 0;
8973 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008977 int translate;
8978 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8979 Py_ssize_t newpos;
8980 /* startpos for collecting untranslatable chars */
8981 Py_ssize_t collstart;
8982 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
Victor Stinner1194ea02014-04-04 19:37:40 +02008985 ch = PyUnicode_READ(kind, data, i);
8986 translate = charmaptranslate_output(ch, mapping, &writer);
8987 if (translate < 0)
8988 goto onError;
8989
8990 if (translate != 0) {
8991 /* it worked => adjust input pointer */
8992 ++i;
8993 continue;
8994 }
8995
8996 /* untranslatable character */
8997 collstart = i;
8998 collend = i+1;
8999
9000 /* find all untranslatable characters */
9001 while (collend < size) {
9002 PyObject *x;
9003 ch = PyUnicode_READ(kind, data, collend);
9004 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009005 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009006 Py_XDECREF(x);
9007 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 ++collend;
9010 }
9011
9012 if (ignore) {
9013 i = collend;
9014 }
9015 else {
9016 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9017 reason, input, &exc,
9018 collstart, collend, &newpos);
9019 if (repunicode == NULL)
9020 goto onError;
9021 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009023 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009024 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009025 Py_DECREF(repunicode);
9026 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009027 }
9028 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009029 Py_XDECREF(exc);
9030 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009031 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009034 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009035 Py_XDECREF(exc);
9036 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 return NULL;
9038}
9039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040/* Deprecated. Use PyUnicode_Translate instead. */
9041PyObject *
9042PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9043 Py_ssize_t size,
9044 PyObject *mapping,
9045 const char *errors)
9046{
Christian Heimes5f520f42012-09-11 14:03:25 +02009047 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009048 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 if (!unicode)
9050 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009051 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9052 Py_DECREF(unicode);
9053 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054}
9055
Alexander Belopolsky40018472011-02-26 01:02:56 +00009056PyObject *
9057PyUnicode_Translate(PyObject *str,
9058 PyObject *mapping,
9059 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009061 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009062 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009063 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064}
Tim Petersced69f82003-09-16 20:30:58 +00009065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009067fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068{
9069 /* No need to call PyUnicode_READY(self) because this function is only
9070 called as a callback from fixup() which does it already. */
9071 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9072 const int kind = PyUnicode_KIND(self);
9073 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009074 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009075 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 Py_ssize_t i;
9077
9078 for (i = 0; i < len; ++i) {
9079 ch = PyUnicode_READ(kind, data, i);
9080 fixed = 0;
9081 if (ch > 127) {
9082 if (Py_UNICODE_ISSPACE(ch))
9083 fixed = ' ';
9084 else {
9085 const int decimal = Py_UNICODE_TODECIMAL(ch);
9086 if (decimal >= 0)
9087 fixed = '0' + decimal;
9088 }
9089 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009090 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009091 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 PyUnicode_WRITE(kind, data, i, fixed);
9093 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009094 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009095 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 }
9098
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009099 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100}
9101
9102PyObject *
9103_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9104{
9105 if (!PyUnicode_Check(unicode)) {
9106 PyErr_BadInternalCall();
9107 return NULL;
9108 }
9109 if (PyUnicode_READY(unicode) == -1)
9110 return NULL;
9111 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9112 /* If the string is already ASCII, just return the same string */
9113 Py_INCREF(unicode);
9114 return unicode;
9115 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009116 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117}
9118
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009119PyObject *
9120PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9121 Py_ssize_t length)
9122{
Victor Stinnerf0124502011-11-21 23:12:56 +01009123 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009124 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009125 Py_UCS4 maxchar;
9126 enum PyUnicode_Kind kind;
9127 void *data;
9128
Victor Stinner99d7ad02012-02-22 13:37:39 +01009129 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009130 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009131 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009132 if (ch > 127) {
9133 int decimal = Py_UNICODE_TODECIMAL(ch);
9134 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009135 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009136 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009137 }
9138 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009139
9140 /* Copy to a new string */
9141 decimal = PyUnicode_New(length, maxchar);
9142 if (decimal == NULL)
9143 return decimal;
9144 kind = PyUnicode_KIND(decimal);
9145 data = PyUnicode_DATA(decimal);
9146 /* Iterate over code points */
9147 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009148 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009149 if (ch > 127) {
9150 int decimal = Py_UNICODE_TODECIMAL(ch);
9151 if (decimal >= 0)
9152 ch = '0' + decimal;
9153 }
9154 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009156 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009157}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009158/* --- Decimal Encoder ---------------------------------------------------- */
9159
Alexander Belopolsky40018472011-02-26 01:02:56 +00009160int
9161PyUnicode_EncodeDecimal(Py_UNICODE *s,
9162 Py_ssize_t length,
9163 char *output,
9164 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009165{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009166 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009167 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009168 enum PyUnicode_Kind kind;
9169 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009170
9171 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 PyErr_BadArgument();
9173 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009174 }
9175
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009176 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 if (unicode == NULL)
9178 return -1;
9179
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 kind = PyUnicode_KIND(unicode);
9181 data = PyUnicode_DATA(unicode);
9182
Victor Stinnerb84d7232011-11-22 01:50:07 +01009183 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009184 PyObject *exc;
9185 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009187 Py_ssize_t startpos;
9188
9189 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009190
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009193 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 decimal = Py_UNICODE_TODECIMAL(ch);
9197 if (decimal >= 0) {
9198 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009199 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 continue;
9201 }
9202 if (0 < ch && ch < 256) {
9203 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009204 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 continue;
9206 }
Victor Stinner6345be92011-11-25 20:09:01 +01009207
Victor Stinner42bf7752011-11-21 22:52:58 +01009208 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009209 exc = NULL;
9210 raise_encode_exception(&exc, "decimal", unicode,
9211 startpos, startpos+1,
9212 "invalid decimal Unicode string");
9213 Py_XDECREF(exc);
9214 Py_DECREF(unicode);
9215 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009216 }
9217 /* 0-terminate the output string */
9218 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009219 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009221}
9222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223/* --- Helpers ------------------------------------------------------------ */
9224
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is clamped to `len` when larger; a negative `end` is taken relative
   to `len` and clamped to 0.  A negative `start` is taken relative to `len`
   and clamped to 0; a `start` greater than `len` is left unchanged, so
   callers must still check `end - start`.

   `start` and `end` must be modifiable lvalues.  `len` is evaluated more
   than once, so it must not have side effects.  The body is wrapped in
   do { } while (0) so that the macro behaves as a single statement, e.g.
   inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009241any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009243 Py_ssize_t end,
9244 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009246 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 void *buf1, *buf2;
9248 Py_ssize_t len1, len2, result;
9249
9250 kind1 = PyUnicode_KIND(s1);
9251 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009252 if (kind1 < kind2)
9253 return -1;
9254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 len1 = PyUnicode_GET_LENGTH(s1);
9256 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009257 ADJUST_INDICES(start, end, len1);
9258 if (end - start < len2)
9259 return -1;
9260
9261 buf1 = PyUnicode_DATA(s1);
9262 buf2 = PyUnicode_DATA(s2);
9263 if (len2 == 1) {
9264 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9265 result = findchar((const char *)buf1 + kind1*start,
9266 kind1, end - start, ch, direction);
9267 if (result == -1)
9268 return -1;
9269 else
9270 return start + result;
9271 }
9272
9273 if (kind2 != kind1) {
9274 buf2 = _PyUnicode_AsKind(s2, kind1);
9275 if (!buf2)
9276 return -2;
9277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278
Victor Stinner794d5672011-10-10 03:21:36 +02009279 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009280 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009281 case PyUnicode_1BYTE_KIND:
9282 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9283 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9284 else
9285 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9286 break;
9287 case PyUnicode_2BYTE_KIND:
9288 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9289 break;
9290 case PyUnicode_4BYTE_KIND:
9291 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9292 break;
9293 default:
9294 assert(0); result = -2;
9295 }
9296 }
9297 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009298 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009299 case PyUnicode_1BYTE_KIND:
9300 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9301 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9302 else
9303 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 break;
9305 case PyUnicode_2BYTE_KIND:
9306 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 break;
9308 case PyUnicode_4BYTE_KIND:
9309 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9310 break;
9311 default:
9312 assert(0); result = -2;
9313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 }
9315
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009316 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 PyMem_Free(buf2);
9318
9319 return result;
9320}
9321
9322Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009323_PyUnicode_InsertThousandsGrouping(
9324 PyObject *unicode, Py_ssize_t index,
9325 Py_ssize_t n_buffer,
9326 void *digits, Py_ssize_t n_digits,
9327 Py_ssize_t min_width,
9328 const char *grouping, PyObject *thousands_sep,
9329 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330{
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009332 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009333 Py_ssize_t thousands_sep_len;
9334 Py_ssize_t len;
9335
9336 if (unicode != NULL) {
9337 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009338 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009339 }
9340 else {
9341 kind = PyUnicode_1BYTE_KIND;
9342 data = NULL;
9343 }
9344 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9345 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9346 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9347 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009348 if (thousands_sep_kind < kind) {
9349 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9350 if (!thousands_sep_data)
9351 return -1;
9352 }
9353 else {
9354 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9355 if (!data)
9356 return -1;
9357 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009358 }
9359
Benjamin Petersonead6b532011-12-20 17:23:42 -06009360 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009362 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009363 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009364 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009366 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009367 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009368 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009369 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009370 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009371 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009372 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009374 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009375 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009376 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009377 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009378 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009380 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009381 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009383 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009384 break;
9385 default:
9386 assert(0);
9387 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009389 if (unicode != NULL && thousands_sep_kind != kind) {
9390 if (thousands_sep_kind < kind)
9391 PyMem_Free(thousands_sep_data);
9392 else
9393 PyMem_Free(data);
9394 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009395 if (unicode == NULL) {
9396 *maxchar = 127;
9397 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009398 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009399 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009400 }
9401 }
9402 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403}
9404
9405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406Py_ssize_t
9407PyUnicode_Count(PyObject *str,
9408 PyObject *substr,
9409 Py_ssize_t start,
9410 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009412 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 void *buf1 = NULL, *buf2 = NULL;
9415 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009416
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009417 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009420 kind1 = PyUnicode_KIND(str);
9421 kind2 = PyUnicode_KIND(substr);
9422 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009423 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009424
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009425 len1 = PyUnicode_GET_LENGTH(str);
9426 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009428 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009430
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 buf1 = PyUnicode_DATA(str);
9432 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009435 if (!buf2)
9436 goto onError;
9437 }
9438
9439 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009441 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009442 result = asciilib_count(
9443 ((Py_UCS1*)buf1) + start, end - start,
9444 buf2, len2, PY_SSIZE_T_MAX
9445 );
9446 else
9447 result = ucs1lib_count(
9448 ((Py_UCS1*)buf1) + start, end - start,
9449 buf2, len2, PY_SSIZE_T_MAX
9450 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 break;
9452 case PyUnicode_2BYTE_KIND:
9453 result = ucs2lib_count(
9454 ((Py_UCS2*)buf1) + start, end - start,
9455 buf2, len2, PY_SSIZE_T_MAX
9456 );
9457 break;
9458 case PyUnicode_4BYTE_KIND:
9459 result = ucs4lib_count(
9460 ((Py_UCS4*)buf1) + start, end - start,
9461 buf2, len2, PY_SSIZE_T_MAX
9462 );
9463 break;
9464 default:
9465 assert(0); result = 0;
9466 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009467
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009468 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 PyMem_Free(buf2);
9470
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009473 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 PyMem_Free(buf2);
9475 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476}
9477
Alexander Belopolsky40018472011-02-26 01:02:56 +00009478Py_ssize_t
9479PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009480 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009481 Py_ssize_t start,
9482 Py_ssize_t end,
9483 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009485 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009487
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009488 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489}
9490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491Py_ssize_t
9492PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9493 Py_ssize_t start, Py_ssize_t end,
9494 int direction)
9495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009497 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 if (PyUnicode_READY(str) == -1)
9499 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009500 len = PyUnicode_GET_LENGTH(str);
9501 ADJUST_INDICES(start, end, len);
9502 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009503 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009505 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9506 kind, end-start, ch, direction);
9507 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009509 else
9510 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511}
9512
Alexander Belopolsky40018472011-02-26 01:02:56 +00009513static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009514tailmatch(PyObject *self,
9515 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516 Py_ssize_t start,
9517 Py_ssize_t end,
9518 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 int kind_self;
9521 int kind_sub;
9522 void *data_self;
9523 void *data_sub;
9524 Py_ssize_t offset;
9525 Py_ssize_t i;
9526 Py_ssize_t end_sub;
9527
9528 if (PyUnicode_READY(self) == -1 ||
9529 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009530 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9533 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009537 if (PyUnicode_GET_LENGTH(substring) == 0)
9538 return 1;
9539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 kind_self = PyUnicode_KIND(self);
9541 data_self = PyUnicode_DATA(self);
9542 kind_sub = PyUnicode_KIND(substring);
9543 data_sub = PyUnicode_DATA(substring);
9544 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9545
9546 if (direction > 0)
9547 offset = end;
9548 else
9549 offset = start;
9550
9551 if (PyUnicode_READ(kind_self, data_self, offset) ==
9552 PyUnicode_READ(kind_sub, data_sub, 0) &&
9553 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9554 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9555 /* If both are of the same kind, memcmp is sufficient */
9556 if (kind_self == kind_sub) {
9557 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009558 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 data_sub,
9560 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009561 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009563 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 else {
9565 /* We do not need to compare 0 and len(substring)-1 because
9566 the if statement above ensured already that they are equal
9567 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 for (i = 1; i < end_sub; ++i) {
9569 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9570 PyUnicode_READ(kind_sub, data_sub, i))
9571 return 0;
9572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575 }
9576
9577 return 0;
9578}
9579
Alexander Belopolsky40018472011-02-26 01:02:56 +00009580Py_ssize_t
9581PyUnicode_Tailmatch(PyObject *str,
9582 PyObject *substr,
9583 Py_ssize_t start,
9584 Py_ssize_t end,
9585 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009587 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009589
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591}
9592
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593/* Apply fixfct filter to the Unicode object self and return a
9594 reference to the modified object */
9595
Alexander Belopolsky40018472011-02-26 01:02:56 +00009596static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009597fixup(PyObject *self,
9598 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 PyObject *u;
9601 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009602 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009604 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009606 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009607 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 /* fix functions return the new maximum character in a string,
9610 if the kind of the resulting unicode object does not change,
9611 everything is fine. Otherwise we need to change the string kind
9612 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009613 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009614
9615 if (maxchar_new == 0) {
9616 /* no changes */;
9617 if (PyUnicode_CheckExact(self)) {
9618 Py_DECREF(u);
9619 Py_INCREF(self);
9620 return self;
9621 }
9622 else
9623 return u;
9624 }
9625
Victor Stinnere6abb482012-05-02 01:15:40 +02009626 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627
Victor Stinnereaab6042011-12-11 22:22:39 +01009628 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009630
9631 /* In case the maximum character changed, we need to
9632 convert the string to the new category. */
9633 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9634 if (v == NULL) {
9635 Py_DECREF(u);
9636 return NULL;
9637 }
9638 if (maxchar_new > maxchar_old) {
9639 /* If the maxchar increased so that the kind changed, not all
9640 characters are representable anymore and we need to fix the
9641 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009642 _PyUnicode_FastCopyCharacters(v, 0,
9643 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009644 maxchar_old = fixfct(v);
9645 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 }
9647 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009648 _PyUnicode_FastCopyCharacters(v, 0,
9649 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009651 Py_DECREF(u);
9652 assert(_PyUnicode_CheckConsistency(v, 1));
9653 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654}
9655
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656static PyObject *
9657ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9660 char *resdata, *data = PyUnicode_DATA(self);
9661 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009662
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 res = PyUnicode_New(len, 127);
9664 if (res == NULL)
9665 return NULL;
9666 resdata = PyUnicode_DATA(res);
9667 if (lower)
9668 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009670 _Py_bytes_upper(resdata, data, len);
9671 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672}
9673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009677 Py_ssize_t j;
9678 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009679 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009681
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009682 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9683
9684 where ! is a negation and \p{xxx} is a character with property xxx.
9685 */
9686 for (j = i - 1; j >= 0; j--) {
9687 c = PyUnicode_READ(kind, data, j);
9688 if (!_PyUnicode_IsCaseIgnorable(c))
9689 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9692 if (final_sigma) {
9693 for (j = i + 1; j < length; j++) {
9694 c = PyUnicode_READ(kind, data, j);
9695 if (!_PyUnicode_IsCaseIgnorable(c))
9696 break;
9697 }
9698 final_sigma = j == length || !_PyUnicode_IsCased(c);
9699 }
9700 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701}
9702
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703static int
9704lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9705 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 /* Obscure special case. */
9708 if (c == 0x3A3) {
9709 mapped[0] = handle_capital_sigma(kind, data, length, i);
9710 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713}
9714
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715static Py_ssize_t
9716do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718 Py_ssize_t i, k = 0;
9719 int n_res, j;
9720 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009721
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722 c = PyUnicode_READ(kind, data, 0);
9723 n_res = _PyUnicode_ToUpperFull(c, mapped);
9724 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009725 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009728 for (i = 1; i < length; i++) {
9729 c = PyUnicode_READ(kind, data, i);
9730 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9731 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009732 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009733 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009734 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009735 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737}
9738
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739static Py_ssize_t
9740do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9741 Py_ssize_t i, k = 0;
9742
9743 for (i = 0; i < length; i++) {
9744 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9745 int n_res, j;
9746 if (Py_UNICODE_ISUPPER(c)) {
9747 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9748 }
9749 else if (Py_UNICODE_ISLOWER(c)) {
9750 n_res = _PyUnicode_ToUpperFull(c, mapped);
9751 }
9752 else {
9753 n_res = 1;
9754 mapped[0] = c;
9755 }
9756 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009757 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009758 res[k++] = mapped[j];
9759 }
9760 }
9761 return k;
9762}
9763
9764static Py_ssize_t
9765do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9766 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768 Py_ssize_t i, k = 0;
9769
9770 for (i = 0; i < length; i++) {
9771 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9772 int n_res, j;
9773 if (lower)
9774 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9775 else
9776 n_res = _PyUnicode_ToUpperFull(c, mapped);
9777 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009778 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009779 res[k++] = mapped[j];
9780 }
9781 }
9782 return k;
9783}
9784
9785static Py_ssize_t
9786do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9787{
9788 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9789}
9790
9791static Py_ssize_t
9792do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9793{
9794 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9795}
9796
Benjamin Petersone51757f2012-01-12 21:10:29 -05009797static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009798do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9799{
9800 Py_ssize_t i, k = 0;
9801
9802 for (i = 0; i < length; i++) {
9803 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9804 Py_UCS4 mapped[3];
9805 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9806 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009807 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009808 res[k++] = mapped[j];
9809 }
9810 }
9811 return k;
9812}
9813
9814static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009815do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9816{
9817 Py_ssize_t i, k = 0;
9818 int previous_is_cased;
9819
9820 previous_is_cased = 0;
9821 for (i = 0; i < length; i++) {
9822 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9823 Py_UCS4 mapped[3];
9824 int n_res, j;
9825
9826 if (previous_is_cased)
9827 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9828 else
9829 n_res = _PyUnicode_ToTitleFull(c, mapped);
9830
9831 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009832 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009833 res[k++] = mapped[j];
9834 }
9835
9836 previous_is_cased = _PyUnicode_IsCased(c);
9837 }
9838 return k;
9839}
9840
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009841static PyObject *
9842case_operation(PyObject *self,
9843 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9844{
9845 PyObject *res = NULL;
9846 Py_ssize_t length, newlength = 0;
9847 int kind, outkind;
9848 void *data, *outdata;
9849 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9850
Benjamin Petersoneea48462012-01-16 14:28:50 -05009851 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009852
9853 kind = PyUnicode_KIND(self);
9854 data = PyUnicode_DATA(self);
9855 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009856 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009857 PyErr_SetString(PyExc_OverflowError, "string is too long");
9858 return NULL;
9859 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009860 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861 if (tmp == NULL)
9862 return PyErr_NoMemory();
9863 newlength = perform(kind, data, length, tmp, &maxchar);
9864 res = PyUnicode_New(newlength, maxchar);
9865 if (res == NULL)
9866 goto leave;
9867 tmpend = tmp + newlength;
9868 outdata = PyUnicode_DATA(res);
9869 outkind = PyUnicode_KIND(res);
9870 switch (outkind) {
9871 case PyUnicode_1BYTE_KIND:
9872 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9873 break;
9874 case PyUnicode_2BYTE_KIND:
9875 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9876 break;
9877 case PyUnicode_4BYTE_KIND:
9878 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9879 break;
9880 default:
9881 assert(0);
9882 break;
9883 }
9884 leave:
9885 PyMem_FREE(tmp);
9886 return res;
9887}
9888
Tim Peters8ce9f162004-08-27 01:49:32 +00009889PyObject *
9890PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009892 PyObject *res;
9893 PyObject *fseq;
9894 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009895 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009897 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009898 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009899 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009900 }
9901
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009902 /* NOTE: the following code can't call back into Python code,
9903 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009904 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009905
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009906 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009908 res = _PyUnicode_JoinArray(separator, items, seqlen);
9909 Py_DECREF(fseq);
9910 return res;
9911}
9912
9913PyObject *
9914_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9915{
9916 PyObject *res = NULL; /* the result */
9917 PyObject *sep = NULL;
9918 Py_ssize_t seplen;
9919 PyObject *item;
9920 Py_ssize_t sz, i, res_offset;
9921 Py_UCS4 maxchar;
9922 Py_UCS4 item_maxchar;
9923 int use_memcpy;
9924 unsigned char *res_data = NULL, *sep_data = NULL;
9925 PyObject *last_obj;
9926 unsigned int kind = 0;
9927
Tim Peters05eba1f2004-08-27 21:32:02 +00009928 /* If empty sequence, return u"". */
9929 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009930 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009932
Tim Peters05eba1f2004-08-27 21:32:02 +00009933 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009934 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009935 if (seqlen == 1) {
9936 if (PyUnicode_CheckExact(items[0])) {
9937 res = items[0];
9938 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009939 return res;
9940 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009941 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009942 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009943 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009944 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009945 /* Set up sep and seplen */
9946 if (separator == NULL) {
9947 /* fall back to a blank space separator */
9948 sep = PyUnicode_FromOrdinal(' ');
9949 if (!sep)
9950 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009951 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009952 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009954 else {
9955 if (!PyUnicode_Check(separator)) {
9956 PyErr_Format(PyExc_TypeError,
9957 "separator: expected str instance,"
9958 " %.80s found",
9959 Py_TYPE(separator)->tp_name);
9960 goto onError;
9961 }
9962 if (PyUnicode_READY(separator))
9963 goto onError;
9964 sep = separator;
9965 seplen = PyUnicode_GET_LENGTH(separator);
9966 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9967 /* inc refcount to keep this code path symmetric with the
9968 above case of a blank separator */
9969 Py_INCREF(sep);
9970 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009971 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009972 }
9973
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009974 /* There are at least two things to join, or else we have a subclass
9975 * of str in the sequence.
9976 * Do a pre-pass to figure out the total amount of space we'll
9977 * need (sz), and see whether all argument are strings.
9978 */
9979 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009980#ifdef Py_DEBUG
9981 use_memcpy = 0;
9982#else
9983 use_memcpy = 1;
9984#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009986 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009987 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 if (!PyUnicode_Check(item)) {
9989 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009990 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009991 " %.80s found",
9992 i, Py_TYPE(item)->tp_name);
9993 goto onError;
9994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 if (PyUnicode_READY(item) == -1)
9996 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009997 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009999 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010000 if (i != 0) {
10001 add_sz += seplen;
10002 }
10003 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 goto onError;
10007 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010008 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010009 if (use_memcpy && last_obj != NULL) {
10010 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10011 use_memcpy = 0;
10012 }
10013 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010014 }
Tim Petersced69f82003-09-16 20:30:58 +000010015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017 if (res == NULL)
10018 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010019
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010020 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010021#ifdef Py_DEBUG
10022 use_memcpy = 0;
10023#else
10024 if (use_memcpy) {
10025 res_data = PyUnicode_1BYTE_DATA(res);
10026 kind = PyUnicode_KIND(res);
10027 if (seplen != 0)
10028 sep_data = PyUnicode_1BYTE_DATA(sep);
10029 }
10030#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010031 if (use_memcpy) {
10032 for (i = 0; i < seqlen; ++i) {
10033 Py_ssize_t itemlen;
10034 item = items[i];
10035
10036 /* Copy item, and maybe the separator. */
10037 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010038 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010040 kind * seplen);
10041 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010043
10044 itemlen = PyUnicode_GET_LENGTH(item);
10045 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010046 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010048 kind * itemlen);
10049 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010051 }
10052 assert(res_data == PyUnicode_1BYTE_DATA(res)
10053 + kind * PyUnicode_GET_LENGTH(res));
10054 }
10055 else {
10056 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10057 Py_ssize_t itemlen;
10058 item = items[i];
10059
10060 /* Copy item, and maybe the separator. */
10061 if (i && seplen != 0) {
10062 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10063 res_offset += seplen;
10064 }
10065
10066 itemlen = PyUnicode_GET_LENGTH(item);
10067 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010068 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 res_offset += itemlen;
10070 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010071 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010073 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010076 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010081 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 return NULL;
10083}
10084
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width
   (1, 2 or 4 byte storage); PyUnicode_WCHAR_KIND is not accepted.

   Every macro argument is parenthesized at each use so that arbitrary
   expressions can be passed.  Note that `kind`, `value` and `length` are
   evaluated more than once, so arguments must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10108
Victor Stinnerd3f08822012-05-29 12:57:52 +020010109void
10110_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10111 Py_UCS4 fill_char)
10112{
10113 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10114 const void *data = PyUnicode_DATA(unicode);
10115 assert(PyUnicode_IS_READY(unicode));
10116 assert(unicode_modifiable(unicode));
10117 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10118 assert(start >= 0);
10119 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10120 FILL(kind, data, fill_char, start, length);
10121}
10122
Victor Stinner3fe55312012-01-04 00:33:50 +010010123Py_ssize_t
10124PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10125 Py_UCS4 fill_char)
10126{
10127 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010128
10129 if (!PyUnicode_Check(unicode)) {
10130 PyErr_BadInternalCall();
10131 return -1;
10132 }
10133 if (PyUnicode_READY(unicode) == -1)
10134 return -1;
10135 if (unicode_check_modifiable(unicode))
10136 return -1;
10137
Victor Stinnerd3f08822012-05-29 12:57:52 +020010138 if (start < 0) {
10139 PyErr_SetString(PyExc_IndexError, "string index out of range");
10140 return -1;
10141 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010142 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10143 PyErr_SetString(PyExc_ValueError,
10144 "fill character is bigger than "
10145 "the string maximum character");
10146 return -1;
10147 }
10148
10149 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10150 length = Py_MIN(maxlen, length);
10151 if (length <= 0)
10152 return 0;
10153
Victor Stinnerd3f08822012-05-29 12:57:52 +020010154 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010155 return length;
10156}
10157
Victor Stinner9310abb2011-10-05 00:59:23 +020010158static PyObject *
10159pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010160 Py_ssize_t left,
10161 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 PyObject *u;
10165 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010166 int kind;
10167 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168
10169 if (left < 0)
10170 left = 0;
10171 if (right < 0)
10172 right = 0;
10173
Victor Stinnerc4b49542011-12-11 22:44:26 +010010174 if (left == 0 && right == 0)
10175 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10178 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010179 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10180 return NULL;
10181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010183 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010185 if (!u)
10186 return NULL;
10187
10188 kind = PyUnicode_KIND(u);
10189 data = PyUnicode_DATA(u);
10190 if (left)
10191 FILL(kind, data, fill, 0, left);
10192 if (right)
10193 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010194 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010195 assert(_PyUnicode_CheckConsistency(u, 1));
10196 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197}
10198
Alexander Belopolsky40018472011-02-26 01:02:56 +000010199PyObject *
10200PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010204 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206
Benjamin Petersonead6b532011-12-20 17:23:42 -060010207 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209 if (PyUnicode_IS_ASCII(string))
10210 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010211 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 PyUnicode_GET_LENGTH(string), keepends);
10213 else
10214 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010215 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 break;
10218 case PyUnicode_2BYTE_KIND:
10219 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010220 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 PyUnicode_GET_LENGTH(string), keepends);
10222 break;
10223 case PyUnicode_4BYTE_KIND:
10224 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010225 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 PyUnicode_GET_LENGTH(string), keepends);
10227 break;
10228 default:
10229 assert(0);
10230 list = 0;
10231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233}
10234
Alexander Belopolsky40018472011-02-26 01:02:56 +000010235static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010236split(PyObject *self,
10237 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010238 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010240 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 void *buf1, *buf2;
10242 Py_ssize_t len1, len2;
10243 PyObject* out;
10244
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010246 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (PyUnicode_READY(self) == -1)
10249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010252 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 if (PyUnicode_IS_ASCII(self))
10255 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010256 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010257 PyUnicode_GET_LENGTH(self), maxcount
10258 );
10259 else
10260 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010261 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010262 PyUnicode_GET_LENGTH(self), maxcount
10263 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 case PyUnicode_2BYTE_KIND:
10265 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 PyUnicode_GET_LENGTH(self), maxcount
10268 );
10269 case PyUnicode_4BYTE_KIND:
10270 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010271 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 PyUnicode_GET_LENGTH(self), maxcount
10273 );
10274 default:
10275 assert(0);
10276 return NULL;
10277 }
10278
10279 if (PyUnicode_READY(substring) == -1)
10280 return NULL;
10281
10282 kind1 = PyUnicode_KIND(self);
10283 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 len1 = PyUnicode_GET_LENGTH(self);
10285 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010286 if (kind1 < kind2 || len1 < len2) {
10287 out = PyList_New(1);
10288 if (out == NULL)
10289 return NULL;
10290 Py_INCREF(self);
10291 PyList_SET_ITEM(out, 0, self);
10292 return out;
10293 }
10294 buf1 = PyUnicode_DATA(self);
10295 buf2 = PyUnicode_DATA(substring);
10296 if (kind2 != kind1) {
10297 buf2 = _PyUnicode_AsKind(substring, kind1);
10298 if (!buf2)
10299 return NULL;
10300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010302 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010304 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10305 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010306 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010307 else
10308 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 break;
10311 case PyUnicode_2BYTE_KIND:
10312 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 break;
10315 case PyUnicode_4BYTE_KIND:
10316 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010317 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 break;
10319 default:
10320 out = NULL;
10321 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010322 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 PyMem_Free(buf2);
10324 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325}
10326
Alexander Belopolsky40018472011-02-26 01:02:56 +000010327static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010328rsplit(PyObject *self,
10329 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010330 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010331{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010332 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 void *buf1, *buf2;
10334 Py_ssize_t len1, len2;
10335 PyObject* out;
10336
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010337 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010338 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 if (PyUnicode_READY(self) == -1)
10341 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010344 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346 if (PyUnicode_IS_ASCII(self))
10347 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 PyUnicode_GET_LENGTH(self), maxcount
10350 );
10351 else
10352 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010354 PyUnicode_GET_LENGTH(self), maxcount
10355 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 case PyUnicode_2BYTE_KIND:
10357 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 case PyUnicode_4BYTE_KIND:
10362 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010363 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 PyUnicode_GET_LENGTH(self), maxcount
10365 );
10366 default:
10367 assert(0);
10368 return NULL;
10369 }
10370
10371 if (PyUnicode_READY(substring) == -1)
10372 return NULL;
10373
10374 kind1 = PyUnicode_KIND(self);
10375 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 len1 = PyUnicode_GET_LENGTH(self);
10377 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010378 if (kind1 < kind2 || len1 < len2) {
10379 out = PyList_New(1);
10380 if (out == NULL)
10381 return NULL;
10382 Py_INCREF(self);
10383 PyList_SET_ITEM(out, 0, self);
10384 return out;
10385 }
10386 buf1 = PyUnicode_DATA(self);
10387 buf2 = PyUnicode_DATA(substring);
10388 if (kind2 != kind1) {
10389 buf2 = _PyUnicode_AsKind(substring, kind1);
10390 if (!buf2)
10391 return NULL;
10392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010394 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010396 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10397 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010398 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399 else
10400 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010401 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 break;
10403 case PyUnicode_2BYTE_KIND:
10404 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010405 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 break;
10407 case PyUnicode_4BYTE_KIND:
10408 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010409 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 break;
10411 default:
10412 out = NULL;
10413 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010414 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 PyMem_Free(buf2);
10416 return out;
10417}
10418
10419static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10421 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010423 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010425 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10426 return asciilib_find(buf1, len1, buf2, len2, offset);
10427 else
10428 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 case PyUnicode_2BYTE_KIND:
10430 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10431 case PyUnicode_4BYTE_KIND:
10432 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10433 }
10434 assert(0);
10435 return -1;
10436}
10437
10438static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010439anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10440 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010442 switch (kind) {
10443 case PyUnicode_1BYTE_KIND:
10444 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10445 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10446 else
10447 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10448 case PyUnicode_2BYTE_KIND:
10449 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10450 case PyUnicode_4BYTE_KIND:
10451 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10452 }
10453 assert(0);
10454 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010455}
10456
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010457static void
10458replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10459 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10460{
10461 int kind = PyUnicode_KIND(u);
10462 void *data = PyUnicode_DATA(u);
10463 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10464 if (kind == PyUnicode_1BYTE_KIND) {
10465 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10466 (Py_UCS1 *)data + len,
10467 u1, u2, maxcount);
10468 }
10469 else if (kind == PyUnicode_2BYTE_KIND) {
10470 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10471 (Py_UCS2 *)data + len,
10472 u1, u2, maxcount);
10473 }
10474 else {
10475 assert(kind == PyUnicode_4BYTE_KIND);
10476 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10477 (Py_UCS4 *)data + len,
10478 u1, u2, maxcount);
10479 }
10480}
10481
/* Build the string obtained from `self` by replacing occurrences of `str1`
   with `str2`, performing at most `maxcount` replacements (a negative
   `maxcount` is mapped to PY_SSIZE_T_MAX, i.e. no practical limit).

   Returns the new string, or the value of unicode_result_unchanged(self)
   when there is nothing to replace, or NULL on error.

   Three strategies are used: same-length single-character replacement,
   same-length multi-character replacement (copy self, then overwrite the
   matches), and different-length replacement (count the matches, allocate
   the exact result size, then copy pieces). */
Alexander Belopolsky40018472011-02-26 01:02:56 +000010482static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483replace(PyObject *self, PyObject *str1,
10484 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 PyObject *u;
10487 char *sbuf = PyUnicode_DATA(self);
10488 char *buf1 = PyUnicode_DATA(str1);
10489 char *buf2 = PyUnicode_DATA(str2);
/* srelease/release1/release2 record whether sbuf/buf1/buf2 currently point
   to a temporary obtained from _PyUnicode_AsKind that has to be freed at
   the done/nothing/error labels. */
10490 int srelease = 0, release1 = 0, release2 = 0;
10491 int skind = PyUnicode_KIND(self);
10492 int kind1 = PyUnicode_KIND(str1);
10493 int kind2 = PyUnicode_KIND(str2);
10494 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10495 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10496 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010497 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010498 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499
/* a negative maxcount means "no limit"; note that the empty-self shortcut
   below is only taken when maxcount is non-negative */
10500 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010501 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010503 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504
/* pattern and replacement are the very same object: nothing would change */
Victor Stinner59de0ee2011-10-07 10:01:28 +020010505 if (str1 == str2)
10506 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507
Victor Stinner49a0a212011-10-12 23:46:10 +020010508 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010509 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10510 if (maxchar < maxchar_str1)
10511 /* substring too wide to be present */
10512 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010513 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10514 /* Replacing str1 with str2 may cause a maxchar reduction in the
10515 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010516 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
/* from here on maxchar is the value used to allocate the result */
Benjamin Peterson7e303732013-06-10 09:19:46 -070010517 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010520 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010522 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010524 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010525 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010526 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010527
Victor Stinner69ed0f42013-04-09 21:48:24 +020010528 u1 = PyUnicode_READ(kind1, buf1, 0);
/* look up u1 in self; a negative pos means there is nothing to replace */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010529 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010530 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010531 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010532 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010534 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010536
/* copy all of self into u, then rewrite characters of u from pos onwards */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010537 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10538 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010539 }
10540 else {
/* rkind is the kind that sbuf, buf1 and buf2 are all brought to */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 int rkind = skind;
10542 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010543 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 if (kind1 < rkind) {
10546 /* widen substring */
10547 buf1 = _PyUnicode_AsKind(str1, rkind);
10548 if (!buf1) goto error;
10549 release1 = 1;
10550 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010551 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010552 if (i < 0)
10553 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 if (rkind > kind2) {
10555 /* widen replacement */
10556 buf2 = _PyUnicode_AsKind(str2, rkind);
10557 if (!buf2) goto error;
10558 release2 = 1;
10559 }
10560 else if (rkind < kind2) {
10561 /* widen self and buf1 */
10562 rkind = kind2;
/* drop the copy of str1 made for the narrower kind before re-converting */
10563 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010564 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 sbuf = _PyUnicode_AsKind(self, rkind);
10566 if (!sbuf) goto error;
10567 srelease = 1;
10568 buf1 = _PyUnicode_AsKind(str1, rkind);
10569 if (!buf1) goto error;
10570 release1 = 1;
10571 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 u = PyUnicode_New(slen, maxchar);
10573 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010575 assert(PyUnicode_KIND(u) == rkind);
10576 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010577
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010578 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010579 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010580 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010582 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010584
/* the first match was replaced above, hence the pre-decrement of maxcount
   before each further search */
10585 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010586 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010587 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010588 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010589 if (i == -1)
10590 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010593 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 }
10598 else {
/* different lengths: n is the number of replacements, i indexes sbuf,
   j is the position of the current match and ires indexes res */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010600 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 int rkind = skind;
10602 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010605 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 buf1 = _PyUnicode_AsKind(str1, rkind);
10607 if (!buf1) goto error;
10608 release1 = 1;
10609 }
/* n is the value reported by anylib_count, which receives maxcount */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010610 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 if (n == 0)
10612 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010614 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 buf2 = _PyUnicode_AsKind(str2, rkind);
10616 if (!buf2) goto error;
10617 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 rkind = kind2;
10622 sbuf = _PyUnicode_AsKind(self, rkind);
10623 if (!sbuf) goto error;
10624 srelease = 1;
10625 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010626 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 buf1 = _PyUnicode_AsKind(str1, rkind);
10628 if (!buf1) goto error;
10629 release1 = 1;
10630 }
10631 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10632 PyUnicode_GET_LENGTH(str1))); */
/* refuse a growth that would push slen + n * (len2 - len1) past
   PY_SSIZE_T_MAX; n is non-zero here because of the check above */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010633 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 PyErr_SetString(PyExc_OverflowError,
10635 "replace string is too long");
10636 goto error;
10637 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010638 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010639 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010640 _Py_INCREF_UNICODE_EMPTY();
10641 if (!unicode_empty)
10642 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 u = unicode_empty;
10644 goto done;
10645 }
/* the byte offsets computed below are rkind * (character index), so the
   size in bytes has to fit in a Py_ssize_t as well */
Xiang Zhangb0541f42017-01-10 10:52:00 +080010646 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 PyErr_SetString(PyExc_OverflowError,
10648 "replace string is too long");
10649 goto error;
10650 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010651 u = PyUnicode_New(new_size, maxchar);
10652 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010654 assert(PyUnicode_KIND(u) == rkind);
10655 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 ires = i = 0;
/* non-empty pattern: alternately copy the unchanged gap before each match
   and the replacement, then the tail after the last match */
10657 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 while (n-- > 0) {
10659 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010660 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010662 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010663 if (j == -1)
10664 break;
10665 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 memcpy(res + rkind * ires,
10668 sbuf + rkind * i,
10669 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 }
10672 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010674 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010676 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010683 memcpy(res + rkind * ires,
10684 sbuf + rkind * i,
10685 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010686 }
10687 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 /* interleave */
/* empty pattern: each round writes str2 followed by one character of self;
   the last round stops right after str2, and whatever is left of self is
   appended after the loop */
10689 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010690 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010692 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694 if (--n <= 0)
10695 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010696 memcpy(res + rkind * ires,
10697 sbuf + rkind * i,
10698 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 ires++;
10700 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010701 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010702 memcpy(res + rkind * ires,
10703 sbuf + rkind * i,
10704 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010706 }
10707
/* u was allocated for the larger maxchar; let unicode_adjust_maxchar
   revisit it when the replacement may have lowered the real maximum */
10708 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010709 unicode_adjust_maxchar(&u);
10710 if (u == NULL)
10711 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010713
/* success: free the temporaries and hand back the new string */
10714 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 if (srelease)
10716 PyMem_FREE(sbuf);
10717 if (release1)
10718 PyMem_FREE(buf1);
10719 if (release2)
10720 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010721 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010723
Benjamin Peterson29060642009-01-31 22:14:21 +000010724 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010725 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 if (srelease)
10727 PyMem_FREE(sbuf);
10728 if (release1)
10729 PyMem_FREE(buf1);
10730 if (release2)
10731 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010732 return unicode_result_unchanged(self);
10733
/* failure: a release flag may be set while its buffer is still NULL only
   if the flag was raised before a failed conversion, so both are checked */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 error:
10735 if (srelease && sbuf)
10736 PyMem_FREE(sbuf);
10737 if (release1 && buf1)
10738 PyMem_FREE(buf1);
10739 if (release2 && buf2)
10740 PyMem_FREE(buf2);
10741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742}
10743
10744/* --- Unicode Object Methods --------------------------------------------- */
10745
INADA Naoki3ae20562017-01-16 20:41:20 +090010746/*[clinic input]
10747str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748
INADA Naoki3ae20562017-01-16 20:41:20 +090010749Return a version of the string where each word is titlecased.
10750
10751More specifically, words start with uppercased characters and all remaining
10752cased characters have lower case.
10753[clinic start generated code]*/
10754
10755static PyObject *
10756unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010757/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010759 if (PyUnicode_READY(self) == -1)
10760 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010761 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762}
10763
INADA Naoki3ae20562017-01-16 20:41:20 +090010764/*[clinic input]
10765str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
INADA Naoki3ae20562017-01-16 20:41:20 +090010767Return a capitalized version of the string.
10768
10769More specifically, make the first character have upper case and the rest lower
10770case.
10771[clinic start generated code]*/
10772
10773static PyObject *
10774unicode_capitalize_impl(PyObject *self)
10775/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010777 if (PyUnicode_READY(self) == -1)
10778 return NULL;
10779 if (PyUnicode_GET_LENGTH(self) == 0)
10780 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010781 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782}
10783
INADA Naoki3ae20562017-01-16 20:41:20 +090010784/*[clinic input]
10785str.casefold as unicode_casefold
10786
10787Return a version of the string suitable for caseless comparisons.
10788[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010789
10790static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010791unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010792/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010793{
10794 if (PyUnicode_READY(self) == -1)
10795 return NULL;
10796 if (PyUnicode_IS_ASCII(self))
10797 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010798 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010799}
10800
10801
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010802/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010803
10804static int
10805convert_uc(PyObject *obj, void *addr)
10806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010809 if (!PyUnicode_Check(obj)) {
10810 PyErr_Format(PyExc_TypeError,
10811 "The fill character must be a unicode character, "
10812 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010813 return 0;
10814 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010815 if (PyUnicode_READY(obj) < 0)
10816 return 0;
10817 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010818 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010820 return 0;
10821 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010822 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010823 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010824}
10825
INADA Naoki3ae20562017-01-16 20:41:20 +090010826/*[clinic input]
10827str.center as unicode_center
10828
10829 width: Py_ssize_t
10830 fillchar: Py_UCS4 = ' '
10831 /
10832
10833Return a centered string of length width.
10834
10835Padding is done using the specified fill character (default is a space).
10836[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837
10838static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010839unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10840/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010842 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
Benjamin Petersonbac79492012-01-14 13:34:47 -050010844 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 return NULL;
10846
Victor Stinnerc4b49542011-12-11 22:44:26 +010010847 if (PyUnicode_GET_LENGTH(self) >= width)
10848 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
Victor Stinnerc4b49542011-12-11 22:44:26 +010010850 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 left = marg / 2 + (marg & width & 1);
10852
Victor Stinner9310abb2011-10-05 00:59:23 +020010853 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854}
10855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856/* This function assumes that str1 and str2 are readied by the caller. */
10857
Marc-André Lemburge5034372000-08-08 08:04:29 +000010858static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010859unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010860{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010861#define COMPARE(TYPE1, TYPE2) \
10862 do { \
10863 TYPE1* p1 = (TYPE1 *)data1; \
10864 TYPE2* p2 = (TYPE2 *)data2; \
10865 TYPE1* end = p1 + len; \
10866 Py_UCS4 c1, c2; \
10867 for (; p1 != end; p1++, p2++) { \
10868 c1 = *p1; \
10869 c2 = *p2; \
10870 if (c1 != c2) \
10871 return (c1 < c2) ? -1 : 1; \
10872 } \
10873 } \
10874 while (0)
10875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 int kind1, kind2;
10877 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010878 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 kind1 = PyUnicode_KIND(str1);
10881 kind2 = PyUnicode_KIND(str2);
10882 data1 = PyUnicode_DATA(str1);
10883 data2 = PyUnicode_DATA(str2);
10884 len1 = PyUnicode_GET_LENGTH(str1);
10885 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010886 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010887
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010888 switch(kind1) {
10889 case PyUnicode_1BYTE_KIND:
10890 {
10891 switch(kind2) {
10892 case PyUnicode_1BYTE_KIND:
10893 {
10894 int cmp = memcmp(data1, data2, len);
10895 /* normalize result of memcmp() into the range [-1; 1] */
10896 if (cmp < 0)
10897 return -1;
10898 if (cmp > 0)
10899 return 1;
10900 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010901 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902 case PyUnicode_2BYTE_KIND:
10903 COMPARE(Py_UCS1, Py_UCS2);
10904 break;
10905 case PyUnicode_4BYTE_KIND:
10906 COMPARE(Py_UCS1, Py_UCS4);
10907 break;
10908 default:
10909 assert(0);
10910 }
10911 break;
10912 }
10913 case PyUnicode_2BYTE_KIND:
10914 {
10915 switch(kind2) {
10916 case PyUnicode_1BYTE_KIND:
10917 COMPARE(Py_UCS2, Py_UCS1);
10918 break;
10919 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010920 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010921 COMPARE(Py_UCS2, Py_UCS2);
10922 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010924 case PyUnicode_4BYTE_KIND:
10925 COMPARE(Py_UCS2, Py_UCS4);
10926 break;
10927 default:
10928 assert(0);
10929 }
10930 break;
10931 }
10932 case PyUnicode_4BYTE_KIND:
10933 {
10934 switch(kind2) {
10935 case PyUnicode_1BYTE_KIND:
10936 COMPARE(Py_UCS4, Py_UCS1);
10937 break;
10938 case PyUnicode_2BYTE_KIND:
10939 COMPARE(Py_UCS4, Py_UCS2);
10940 break;
10941 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010942 {
10943#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10944 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10945 /* normalize result of wmemcmp() into the range [-1; 1] */
10946 if (cmp < 0)
10947 return -1;
10948 if (cmp > 0)
10949 return 1;
10950#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010952#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010953 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010954 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010955 default:
10956 assert(0);
10957 }
10958 break;
10959 }
10960 default:
10961 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010962 }
10963
Victor Stinner770e19e2012-10-04 22:59:45 +020010964 if (len1 == len2)
10965 return 0;
10966 if (len1 < len2)
10967 return -1;
10968 else
10969 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010970
10971#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010972}
10973
Benjamin Peterson621b4302016-09-09 13:54:34 -070010974static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010975unicode_compare_eq(PyObject *str1, PyObject *str2)
10976{
10977 int kind;
10978 void *data1, *data2;
10979 Py_ssize_t len;
10980 int cmp;
10981
Victor Stinnere5567ad2012-10-23 02:48:49 +020010982 len = PyUnicode_GET_LENGTH(str1);
10983 if (PyUnicode_GET_LENGTH(str2) != len)
10984 return 0;
10985 kind = PyUnicode_KIND(str1);
10986 if (PyUnicode_KIND(str2) != kind)
10987 return 0;
10988 data1 = PyUnicode_DATA(str1);
10989 data2 = PyUnicode_DATA(str2);
10990
10991 cmp = memcmp(data1, data2, len * kind);
10992 return (cmp == 0);
10993}
10994
10995
Alexander Belopolsky40018472011-02-26 01:02:56 +000010996int
10997PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11000 if (PyUnicode_READY(left) == -1 ||
11001 PyUnicode_READY(right) == -1)
11002 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011003
11004 /* a string is equal to itself */
11005 if (left == right)
11006 return 0;
11007
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011008 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011010 PyErr_Format(PyExc_TypeError,
11011 "Can't compare %.100s and %.100s",
11012 left->ob_type->tp_name,
11013 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 return -1;
11015}
11016
Martin v. Löwis5b222132007-06-10 09:51:05 +000011017int
11018PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 Py_ssize_t i;
11021 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011023 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024
Victor Stinner910337b2011-10-03 03:20:16 +020011025 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011026 if (!PyUnicode_IS_READY(uni)) {
11027 const wchar_t *ws = _PyUnicode_WSTR(uni);
11028 /* Compare Unicode string and source character set string */
11029 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11030 if (chr != ustr[i])
11031 return (chr < ustr[i]) ? -1 : 1;
11032 }
11033 /* This check keeps Python strings that end in '\0' from comparing equal
11034 to C strings identical up to that point. */
11035 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11036 return 1; /* uni is longer */
11037 if (ustr[i])
11038 return -1; /* str is longer */
11039 return 0;
11040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011042 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011043 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011044 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011045 size_t len, len2 = strlen(str);
11046 int cmp;
11047
11048 len = Py_MIN(len1, len2);
11049 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011050 if (cmp != 0) {
11051 if (cmp < 0)
11052 return -1;
11053 else
11054 return 1;
11055 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011056 if (len1 > len2)
11057 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011058 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011059 return -1; /* str is longer */
11060 return 0;
11061 }
11062 else {
11063 void *data = PyUnicode_DATA(uni);
11064 /* Compare Unicode string and source character set string */
11065 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011066 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011067 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11068 /* This check keeps Python strings that end in '\0' from comparing equal
11069 to C strings identical up to that point. */
11070 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11071 return 1; /* uni is longer */
11072 if (str[i])
11073 return -1; /* str is longer */
11074 return 0;
11075 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011076}
11077
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011078static int
11079non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11080{
11081 size_t i, len;
11082 const wchar_t *p;
11083 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11084 if (strlen(str) != len)
11085 return 0;
11086 p = _PyUnicode_WSTR(unicode);
11087 assert(p);
11088 for (i = 0; i < len; i++) {
11089 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011090 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011091 return 0;
11092 }
11093 return 1;
11094}
11095
11096int
11097_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11098{
11099 size_t len;
11100 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011101 assert(str);
11102#ifndef NDEBUG
11103 for (const char *p = str; *p; p++) {
11104 assert((unsigned char)*p < 128);
11105 }
11106#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011107 if (PyUnicode_READY(unicode) == -1) {
11108 /* Memory error or bad data */
11109 PyErr_Clear();
11110 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11111 }
11112 if (!PyUnicode_IS_ASCII(unicode))
11113 return 0;
11114 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11115 return strlen(str) == len &&
11116 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11117}
11118
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011119int
11120_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11121{
11122 PyObject *right_uni;
11123 Py_hash_t hash;
11124
11125 assert(_PyUnicode_CHECK(left));
11126 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011127#ifndef NDEBUG
11128 for (const char *p = right->string; *p; p++) {
11129 assert((unsigned char)*p < 128);
11130 }
11131#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011132
11133 if (PyUnicode_READY(left) == -1) {
11134 /* memory error or bad data */
11135 PyErr_Clear();
11136 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11137 }
11138
11139 if (!PyUnicode_IS_ASCII(left))
11140 return 0;
11141
11142 right_uni = _PyUnicode_FromId(right); /* borrowed */
11143 if (right_uni == NULL) {
11144 /* memory error or bad data */
11145 PyErr_Clear();
11146 return _PyUnicode_EqualToASCIIString(left, right->string);
11147 }
11148
11149 if (left == right_uni)
11150 return 1;
11151
11152 if (PyUnicode_CHECK_INTERNED(left))
11153 return 0;
11154
11155 assert(_PyUnicode_HASH(right_uni) != 1);
11156 hash = _PyUnicode_HASH(left);
11157 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11158 return 0;
11159
11160 return unicode_compare_eq(left, right_uni);
11161}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011162
Benjamin Peterson29060642009-01-31 22:14:21 +000011163#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011164 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011165
Alexander Belopolsky40018472011-02-26 01:02:56 +000011166PyObject *
11167PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011168{
11169 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011170 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011171
Victor Stinnere5567ad2012-10-23 02:48:49 +020011172 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11173 Py_RETURN_NOTIMPLEMENTED;
11174
11175 if (PyUnicode_READY(left) == -1 ||
11176 PyUnicode_READY(right) == -1)
11177 return NULL;
11178
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011179 if (left == right) {
11180 switch (op) {
11181 case Py_EQ:
11182 case Py_LE:
11183 case Py_GE:
11184 /* a string is equal to itself */
11185 v = Py_True;
11186 break;
11187 case Py_NE:
11188 case Py_LT:
11189 case Py_GT:
11190 v = Py_False;
11191 break;
11192 default:
11193 PyErr_BadArgument();
11194 return NULL;
11195 }
11196 }
11197 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011198 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011199 result ^= (op == Py_NE);
11200 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011201 }
11202 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011203 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011204
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011205 /* Convert the return value to a Boolean */
11206 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011207 case Py_LE:
11208 v = TEST_COND(result <= 0);
11209 break;
11210 case Py_GE:
11211 v = TEST_COND(result >= 0);
11212 break;
11213 case Py_LT:
11214 v = TEST_COND(result == -1);
11215 break;
11216 case Py_GT:
11217 v = TEST_COND(result == 1);
11218 break;
11219 default:
11220 PyErr_BadArgument();
11221 return NULL;
11222 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011223 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011224 Py_INCREF(v);
11225 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011226}
11227
Alexander Belopolsky40018472011-02-26 01:02:56 +000011228int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011229_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11230{
11231 return unicode_eq(aa, bb);
11232}
11233
11234int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011236{
Victor Stinner77282cb2013-04-14 19:22:47 +020011237 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 void *buf1, *buf2;
11239 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011240 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011241
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011244 "'in <string>' requires string as left operand, not %.100s",
11245 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011246 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011247 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011248 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011249 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011250 if (ensure_unicode(str) < 0)
11251 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011254 kind2 = PyUnicode_KIND(substr);
11255 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011256 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011258 len2 = PyUnicode_GET_LENGTH(substr);
11259 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011260 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011261 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011262 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011263 if (len2 == 1) {
11264 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11265 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011266 return result;
11267 }
11268 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011269 buf2 = _PyUnicode_AsKind(substr, kind1);
11270 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011271 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273
Victor Stinner77282cb2013-04-14 19:22:47 +020011274 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 case PyUnicode_1BYTE_KIND:
11276 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11277 break;
11278 case PyUnicode_2BYTE_KIND:
11279 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11280 break;
11281 case PyUnicode_4BYTE_KIND:
11282 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11283 break;
11284 default:
11285 result = -1;
11286 assert(0);
11287 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011288
Victor Stinner77282cb2013-04-14 19:22:47 +020011289 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 PyMem_Free(buf2);
11291
Guido van Rossum403d68b2000-03-13 15:55:09 +000011292 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011293}
11294
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295/* Concat to string or Unicode object giving a new Unicode object. */
11296
Alexander Belopolsky40018472011-02-26 01:02:56 +000011297PyObject *
11298PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011300 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011301 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011304 if (ensure_unicode(left) < 0)
11305 return NULL;
11306
11307 if (!PyUnicode_Check(right)) {
11308 PyErr_Format(PyExc_TypeError,
11309 "can only concatenate str (not \"%.200s\") to str",
11310 right->ob_type->tp_name);
11311 return NULL;
11312 }
11313 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315
11316 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011317 if (left == unicode_empty)
11318 return PyUnicode_FromObject(right);
11319 if (right == unicode_empty)
11320 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011322 left_len = PyUnicode_GET_LENGTH(left);
11323 right_len = PyUnicode_GET_LENGTH(right);
11324 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011325 PyErr_SetString(PyExc_OverflowError,
11326 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011327 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011328 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011329 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011330
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011331 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11332 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011333 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011336 result = PyUnicode_New(new_len, maxchar);
11337 if (result == NULL)
11338 return NULL;
11339 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11340 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11341 assert(_PyUnicode_CheckConsistency(result, 1));
11342 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343}
11344
Walter Dörwald1ab83302007-05-18 17:15:44 +000011345void
Victor Stinner23e56682011-10-03 03:54:37 +020011346PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011347{
Victor Stinner23e56682011-10-03 03:54:37 +020011348 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011349 Py_UCS4 maxchar, maxchar2;
11350 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011351
11352 if (p_left == NULL) {
11353 if (!PyErr_Occurred())
11354 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011355 return;
11356 }
Victor Stinner23e56682011-10-03 03:54:37 +020011357 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011358 if (right == NULL || left == NULL
11359 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011360 if (!PyErr_Occurred())
11361 PyErr_BadInternalCall();
11362 goto error;
11363 }
11364
Benjamin Petersonbac79492012-01-14 13:34:47 -050011365 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011366 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011367 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011368 goto error;
11369
Victor Stinner488fa492011-12-12 00:01:39 +010011370 /* Shortcuts */
11371 if (left == unicode_empty) {
11372 Py_DECREF(left);
11373 Py_INCREF(right);
11374 *p_left = right;
11375 return;
11376 }
11377 if (right == unicode_empty)
11378 return;
11379
11380 left_len = PyUnicode_GET_LENGTH(left);
11381 right_len = PyUnicode_GET_LENGTH(right);
11382 if (left_len > PY_SSIZE_T_MAX - right_len) {
11383 PyErr_SetString(PyExc_OverflowError,
11384 "strings are too large to concat");
11385 goto error;
11386 }
11387 new_len = left_len + right_len;
11388
11389 if (unicode_modifiable(left)
11390 && PyUnicode_CheckExact(right)
11391 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011392 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11393 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011394 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011395 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011396 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11397 {
11398 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011399 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011400 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011401
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011402 /* copy 'right' into the newly allocated area of 'left' */
11403 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011404 }
Victor Stinner488fa492011-12-12 00:01:39 +010011405 else {
11406 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11407 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011408 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011409
Victor Stinner488fa492011-12-12 00:01:39 +010011410 /* Concat the two Unicode strings */
11411 res = PyUnicode_New(new_len, maxchar);
11412 if (res == NULL)
11413 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011414 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11415 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011416 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011417 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011418 }
11419 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011420 return;
11421
11422error:
Victor Stinner488fa492011-12-12 00:01:39 +010011423 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011424}
11425
11426void
11427PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11428{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011429 PyUnicode_Append(pleft, right);
11430 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011431}
11432
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011433/*
11434Wraps stringlib_parse_args_finds() and additionally ensures that the
11435first argument is a unicode object.
11436*/
11437
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011438static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011439parse_args_finds_unicode(const char * function_name, PyObject *args,
11440 PyObject **substring,
11441 Py_ssize_t *start, Py_ssize_t *end)
11442{
11443 if(stringlib_parse_args_finds(function_name, args, substring,
11444 start, end)) {
11445 if (ensure_unicode(*substring) < 0)
11446 return 0;
11447 return 1;
11448 }
11449 return 0;
11450}
11451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011452PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011455Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011456string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011457interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458
11459static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011460unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011462 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011463 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011464 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011466 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 void *buf1, *buf2;
11468 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011470 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 kind1 = PyUnicode_KIND(self);
11474 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011475 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011476 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 len1 = PyUnicode_GET_LENGTH(self);
11479 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011481 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011482 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011483
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011484 buf1 = PyUnicode_DATA(self);
11485 buf2 = PyUnicode_DATA(substring);
11486 if (kind2 != kind1) {
11487 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011488 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011489 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011490 }
11491 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 case PyUnicode_1BYTE_KIND:
11493 iresult = ucs1lib_count(
11494 ((Py_UCS1*)buf1) + start, end - start,
11495 buf2, len2, PY_SSIZE_T_MAX
11496 );
11497 break;
11498 case PyUnicode_2BYTE_KIND:
11499 iresult = ucs2lib_count(
11500 ((Py_UCS2*)buf1) + start, end - start,
11501 buf2, len2, PY_SSIZE_T_MAX
11502 );
11503 break;
11504 case PyUnicode_4BYTE_KIND:
11505 iresult = ucs4lib_count(
11506 ((Py_UCS4*)buf1) + start, end - start,
11507 buf2, len2, PY_SSIZE_T_MAX
11508 );
11509 break;
11510 default:
11511 assert(0); iresult = 0;
11512 }
11513
11514 result = PyLong_FromSsize_t(iresult);
11515
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011516 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 return result;
11520}
11521
INADA Naoki3ae20562017-01-16 20:41:20 +090011522/*[clinic input]
11523str.encode as unicode_encode
11524
11525 encoding: str(c_default="NULL") = 'utf-8'
11526 The encoding in which to encode the string.
11527 errors: str(c_default="NULL") = 'strict'
11528 The error handling scheme to use for encoding errors.
11529 The default is 'strict' meaning that encoding errors raise a
11530 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11531 'xmlcharrefreplace' as well as any other name registered with
11532 codecs.register_error that can handle UnicodeEncodeErrors.
11533
11534Encode the string using the codec registered for encoding.
11535[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011538unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011539/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011541 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011542}
11543
INADA Naoki3ae20562017-01-16 20:41:20 +090011544/*[clinic input]
11545str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
INADA Naoki3ae20562017-01-16 20:41:20 +090011547 tabsize: int = 8
11548
11549Return a copy where all tab characters are expanded using spaces.
11550
11551If tabsize is not given, a tab size of 8 characters is assumed.
11552[clinic start generated code]*/
11553
11554static PyObject *
11555unicode_expandtabs_impl(PyObject *self, int tabsize)
11556/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011558 Py_ssize_t i, j, line_pos, src_len, incr;
11559 Py_UCS4 ch;
11560 PyObject *u;
11561 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011562 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011563 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
Antoine Pitrou22425222011-10-04 19:10:51 +020011565 if (PyUnicode_READY(self) == -1)
11566 return NULL;
11567
Thomas Wouters7e474022000-07-16 12:04:32 +000011568 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011569 src_len = PyUnicode_GET_LENGTH(self);
11570 i = j = line_pos = 0;
11571 kind = PyUnicode_KIND(self);
11572 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011573 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011574 for (; i < src_len; i++) {
11575 ch = PyUnicode_READ(kind, src_data, i);
11576 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011577 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011579 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011581 goto overflow;
11582 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011584 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011588 goto overflow;
11589 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011591 if (ch == '\n' || ch == '\r')
11592 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011595 if (!found)
11596 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011597
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600 if (!u)
11601 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011602 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Antoine Pitroue71d5742011-10-04 15:55:09 +020011606 for (; i < src_len; i++) {
11607 ch = PyUnicode_READ(kind, src_data, i);
11608 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011609 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011610 incr = tabsize - (line_pos % tabsize);
11611 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011612 FILL(kind, dest_data, ' ', j, incr);
11613 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011615 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011617 line_pos++;
11618 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011619 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011620 if (ch == '\n' || ch == '\r')
11621 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011623 }
11624 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011625 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011626
Antoine Pitroue71d5742011-10-04 15:55:09 +020011627 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011628 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630}
11631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634\n\
11635Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011636such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637arguments start and end are interpreted as in slice notation.\n\
11638\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011639Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
11641static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011644 /* initialize variables to prevent gcc warning */
11645 PyObject *substring = NULL;
11646 Py_ssize_t start = 0;
11647 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011648 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011650 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011653 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011656 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 if (result == -2)
11659 return NULL;
11660
Christian Heimes217cfd12007-12-02 14:31:20 +000011661 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662}
11663
11664static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011665unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011667 void *data;
11668 enum PyUnicode_Kind kind;
11669 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011670
11671 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11672 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011674 }
11675 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11676 PyErr_SetString(PyExc_IndexError, "string index out of range");
11677 return NULL;
11678 }
11679 kind = PyUnicode_KIND(self);
11680 data = PyUnicode_DATA(self);
11681 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011682 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683}
11684
Guido van Rossumc2504932007-09-18 19:42:40 +000011685/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011686 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011687static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011688unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689{
Guido van Rossumc2504932007-09-18 19:42:40 +000011690 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011691 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011692
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011693#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011694 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011695#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (_PyUnicode_HASH(self) != -1)
11697 return _PyUnicode_HASH(self);
11698 if (PyUnicode_READY(self) == -1)
11699 return -1;
11700 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011701 /*
11702 We make the hash of the empty string be 0, rather than using
11703 (prefix ^ suffix), since this slightly obfuscates the hash secret
11704 */
11705 if (len == 0) {
11706 _PyUnicode_HASH(self) = 0;
11707 return 0;
11708 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011709 x = _Py_HashBytes(PyUnicode_DATA(self),
11710 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011712 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713}
11714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011715PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011718Return the lowest index in S where substring sub is found, \n\
11719such that sub is contained within S[start:end]. Optional\n\
11720arguments start and end are interpreted as in slice notation.\n\
11721\n\
11722Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
11724static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011727 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011728 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011729 PyObject *substring = NULL;
11730 Py_ssize_t start = 0;
11731 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011733 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011736 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011739 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (result == -2)
11742 return NULL;
11743
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 if (result < 0) {
11745 PyErr_SetString(PyExc_ValueError, "substring not found");
11746 return NULL;
11747 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011748
Christian Heimes217cfd12007-12-02 14:31:20 +000011749 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750}
11751
INADA Naoki3ae20562017-01-16 20:41:20 +090011752/*[clinic input]
11753str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
INADA Naoki3ae20562017-01-16 20:41:20 +090011755Return True if the string is a lowercase string, False otherwise.
11756
11757A string is lowercase if all cased characters in the string are lowercase and
11758there is at least one cased character in the string.
11759[clinic start generated code]*/
11760
11761static PyObject *
11762unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011763/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 Py_ssize_t i, length;
11766 int kind;
11767 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 int cased;
11769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (PyUnicode_READY(self) == -1)
11771 return NULL;
11772 length = PyUnicode_GET_LENGTH(self);
11773 kind = PyUnicode_KIND(self);
11774 data = PyUnicode_DATA(self);
11775
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (length == 1)
11778 return PyBool_FromLong(
11779 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011781 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011783 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 for (i = 0; i < length; i++) {
11787 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011788
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011790 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 else if (!cased && Py_UNICODE_ISLOWER(ch))
11792 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011794 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795}
11796
INADA Naoki3ae20562017-01-16 20:41:20 +090011797/*[clinic input]
11798str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
INADA Naoki3ae20562017-01-16 20:41:20 +090011800Return True if the string is an uppercase string, False otherwise.
11801
11802A string is uppercase if all cased characters in the string are uppercase and
11803there is at least one cased character in the string.
11804[clinic start generated code]*/
11805
11806static PyObject *
11807unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011808/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 Py_ssize_t i, length;
11811 int kind;
11812 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 int cased;
11814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 if (PyUnicode_READY(self) == -1)
11816 return NULL;
11817 length = PyUnicode_GET_LENGTH(self);
11818 kind = PyUnicode_KIND(self);
11819 data = PyUnicode_DATA(self);
11820
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 if (length == 1)
11823 return PyBool_FromLong(
11824 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011826 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011828 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011829
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 for (i = 0; i < length; i++) {
11832 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011833
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011835 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 else if (!cased && Py_UNICODE_ISUPPER(ch))
11837 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011839 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840}
11841
INADA Naoki3ae20562017-01-16 20:41:20 +090011842/*[clinic input]
11843str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
INADA Naoki3ae20562017-01-16 20:41:20 +090011845Return True if the string is a title-cased string, False otherwise.
11846
11847In a title-cased string, upper- and title-case characters may only
11848follow uncased characters and lowercase characters only cased ones.
11849[clinic start generated code]*/
11850
11851static PyObject *
11852unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011853/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 Py_ssize_t i, length;
11856 int kind;
11857 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 int cased, previous_is_cased;
11859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (PyUnicode_READY(self) == -1)
11861 return NULL;
11862 length = PyUnicode_GET_LENGTH(self);
11863 kind = PyUnicode_KIND(self);
11864 data = PyUnicode_DATA(self);
11865
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (length == 1) {
11868 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11869 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11870 (Py_UNICODE_ISUPPER(ch) != 0));
11871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011873 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011875 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011876
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877 cased = 0;
11878 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 for (i = 0; i < length; i++) {
11880 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011881
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11883 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011884 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011885 previous_is_cased = 1;
11886 cased = 1;
11887 }
11888 else if (Py_UNICODE_ISLOWER(ch)) {
11889 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011890 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 previous_is_cased = 1;
11892 cased = 1;
11893 }
11894 else
11895 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011897 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898}
11899
INADA Naoki3ae20562017-01-16 20:41:20 +090011900/*[clinic input]
11901str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
INADA Naoki3ae20562017-01-16 20:41:20 +090011903Return True if the string is a whitespace string, False otherwise.
11904
11905A string is whitespace if all characters in the string are whitespace and there
11906is at least one character in the string.
11907[clinic start generated code]*/
11908
11909static PyObject *
11910unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011911/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 Py_ssize_t i, length;
11914 int kind;
11915 void *data;
11916
11917 if (PyUnicode_READY(self) == -1)
11918 return NULL;
11919 length = PyUnicode_GET_LENGTH(self);
11920 kind = PyUnicode_KIND(self);
11921 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 if (length == 1)
11925 return PyBool_FromLong(
11926 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011928 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011930 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 for (i = 0; i < length; i++) {
11933 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011934 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011935 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011937 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938}
11939
INADA Naoki3ae20562017-01-16 20:41:20 +090011940/*[clinic input]
11941str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011942
INADA Naoki3ae20562017-01-16 20:41:20 +090011943Return True if the string is an alphabetic string, False otherwise.
11944
11945A string is alphabetic if all characters in the string are alphabetic and there
11946is at least one character in the string.
11947[clinic start generated code]*/
11948
11949static PyObject *
11950unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011951/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 Py_ssize_t i, length;
11954 int kind;
11955 void *data;
11956
11957 if (PyUnicode_READY(self) == -1)
11958 return NULL;
11959 length = PyUnicode_GET_LENGTH(self);
11960 kind = PyUnicode_KIND(self);
11961 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011962
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011963 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 if (length == 1)
11965 return PyBool_FromLong(
11966 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011967
11968 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011970 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 for (i = 0; i < length; i++) {
11973 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011974 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011975 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011976 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011977}
11978
INADA Naoki3ae20562017-01-16 20:41:20 +090011979/*[clinic input]
11980str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011981
INADA Naoki3ae20562017-01-16 20:41:20 +090011982Return True if the string is an alpha-numeric string, False otherwise.
11983
11984A string is alpha-numeric if all characters in the string are alpha-numeric and
11985there is at least one character in the string.
11986[clinic start generated code]*/
11987
11988static PyObject *
11989unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011990/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 int kind;
11993 void *data;
11994 Py_ssize_t len, i;
11995
11996 if (PyUnicode_READY(self) == -1)
11997 return NULL;
11998
11999 kind = PyUnicode_KIND(self);
12000 data = PyUnicode_DATA(self);
12001 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012003 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (len == 1) {
12005 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12006 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12007 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012008
12009 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012011 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 for (i = 0; i < len; i++) {
12014 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012015 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012016 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012017 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012018 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012019}
12020
INADA Naoki3ae20562017-01-16 20:41:20 +090012021/*[clinic input]
12022str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023
INADA Naoki3ae20562017-01-16 20:41:20 +090012024Return True if the string is a decimal string, False otherwise.
12025
12026A string is a decimal string if all characters in the string are decimal and
12027there is at least one character in the string.
12028[clinic start generated code]*/
12029
12030static PyObject *
12031unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012032/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 Py_ssize_t i, length;
12035 int kind;
12036 void *data;
12037
12038 if (PyUnicode_READY(self) == -1)
12039 return NULL;
12040 length = PyUnicode_GET_LENGTH(self);
12041 kind = PyUnicode_KIND(self);
12042 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 if (length == 1)
12046 return PyBool_FromLong(
12047 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012049 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012051 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 for (i = 0; i < length; i++) {
12054 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012055 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012057 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058}
12059
INADA Naoki3ae20562017-01-16 20:41:20 +090012060/*[clinic input]
12061str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
INADA Naoki3ae20562017-01-16 20:41:20 +090012063Return True if the string is a digit string, False otherwise.
12064
12065A string is a digit string if all characters in the string are digits and there
12066is at least one character in the string.
12067[clinic start generated code]*/
12068
12069static PyObject *
12070unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012071/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 Py_ssize_t i, length;
12074 int kind;
12075 void *data;
12076
12077 if (PyUnicode_READY(self) == -1)
12078 return NULL;
12079 length = PyUnicode_GET_LENGTH(self);
12080 kind = PyUnicode_KIND(self);
12081 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 if (length == 1) {
12085 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12086 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012089 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012091 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 for (i = 0; i < length; i++) {
12094 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012095 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012097 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098}
12099
INADA Naoki3ae20562017-01-16 20:41:20 +090012100/*[clinic input]
12101str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
INADA Naoki3ae20562017-01-16 20:41:20 +090012103Return True if the string is a numeric string, False otherwise.
12104
12105A string is numeric if all characters in the string are numeric and there is at
12106least one character in the string.
12107[clinic start generated code]*/
12108
12109static PyObject *
12110unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012111/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 Py_ssize_t i, length;
12114 int kind;
12115 void *data;
12116
12117 if (PyUnicode_READY(self) == -1)
12118 return NULL;
12119 length = PyUnicode_GET_LENGTH(self);
12120 kind = PyUnicode_KIND(self);
12121 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 if (length == 1)
12125 return PyBool_FromLong(
12126 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012128 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012130 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 for (i = 0; i < length; i++) {
12133 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012134 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012136 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137}
12138
Martin v. Löwis47383402007-08-15 07:32:56 +000012139int
12140PyUnicode_IsIdentifier(PyObject *self)
12141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 int kind;
12143 void *data;
12144 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012145 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 if (PyUnicode_READY(self) == -1) {
12148 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 }
12151
12152 /* Special case for empty strings */
12153 if (PyUnicode_GET_LENGTH(self) == 0)
12154 return 0;
12155 kind = PyUnicode_KIND(self);
12156 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012157
12158 /* PEP 3131 says that the first character must be in
12159 XID_Start and subsequent characters in XID_Continue,
12160 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012161 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012162 letters, digits, underscore). However, given the current
12163 definition of XID_Start and XID_Continue, it is sufficient
12164 to check just for these, except that _ must be allowed
12165 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012167 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012168 return 0;
12169
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012170 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012173 return 1;
12174}
12175
INADA Naoki3ae20562017-01-16 20:41:20 +090012176/*[clinic input]
12177str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012178
INADA Naoki3ae20562017-01-16 20:41:20 +090012179Return True if the string is a valid Python identifier, False otherwise.
12180
12181Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12182"class".
12183[clinic start generated code]*/
12184
12185static PyObject *
12186unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012187/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012188{
12189 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12190}
12191
INADA Naoki3ae20562017-01-16 20:41:20 +090012192/*[clinic input]
12193str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012194
INADA Naoki3ae20562017-01-16 20:41:20 +090012195Return True if the string is printable, False otherwise.
12196
12197A string is printable if all of its characters are considered printable in
12198repr() or if it is empty.
12199[clinic start generated code]*/
12200
12201static PyObject *
12202unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012203/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 Py_ssize_t i, length;
12206 int kind;
12207 void *data;
12208
12209 if (PyUnicode_READY(self) == -1)
12210 return NULL;
12211 length = PyUnicode_GET_LENGTH(self);
12212 kind = PyUnicode_KIND(self);
12213 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012214
12215 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (length == 1)
12217 return PyBool_FromLong(
12218 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 for (i = 0; i < length; i++) {
12221 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012222 Py_RETURN_FALSE;
12223 }
12224 }
12225 Py_RETURN_TRUE;
12226}
12227
INADA Naoki3ae20562017-01-16 20:41:20 +090012228/*[clinic input]
12229str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230
INADA Naoki3ae20562017-01-16 20:41:20 +090012231 iterable: object
12232 /
12233
12234Concatenate any number of strings.
12235
Martin Panter91a88662017-01-24 00:30:06 +000012236The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012237The result is returned as a new string.
12238
12239Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12240[clinic start generated code]*/
12241
12242static PyObject *
12243unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012244/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245{
INADA Naoki3ae20562017-01-16 20:41:20 +090012246 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247}
12248
Martin v. Löwis18e16552006-02-15 17:27:45 +000012249static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012250unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 if (PyUnicode_READY(self) == -1)
12253 return -1;
12254 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255}
12256
INADA Naoki3ae20562017-01-16 20:41:20 +090012257/*[clinic input]
12258str.ljust as unicode_ljust
12259
12260 width: Py_ssize_t
12261 fillchar: Py_UCS4 = ' '
12262 /
12263
12264Return a left-justified string of length width.
12265
12266Padding is done using the specified fill character (default is a space).
12267[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
12269static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012270unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12271/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012273 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275
Victor Stinnerc4b49542011-12-11 22:44:26 +010012276 if (PyUnicode_GET_LENGTH(self) >= width)
12277 return unicode_result_unchanged(self);
12278
12279 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280}
12281
INADA Naoki3ae20562017-01-16 20:41:20 +090012282/*[clinic input]
12283str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284
INADA Naoki3ae20562017-01-16 20:41:20 +090012285Return a copy of the string converted to lowercase.
12286[clinic start generated code]*/
12287
12288static PyObject *
12289unicode_lower_impl(PyObject *self)
12290/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012292 if (PyUnicode_READY(self) == -1)
12293 return NULL;
12294 if (PyUnicode_IS_ASCII(self))
12295 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012296 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297}
12298
/* Strip modes accepted by the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, indexed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012308/* externally visible for str.strip(unicode) */
12309PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012310_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 void *data;
12313 int kind;
12314 Py_ssize_t i, j, len;
12315 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012316 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12319 return NULL;
12320
12321 kind = PyUnicode_KIND(self);
12322 data = PyUnicode_DATA(self);
12323 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012324 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12326 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012327 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012328
Benjamin Peterson14339b62009-01-31 16:36:08 +000012329 i = 0;
12330 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012331 while (i < len) {
12332 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12333 if (!BLOOM(sepmask, ch))
12334 break;
12335 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12336 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012337 i++;
12338 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340
Benjamin Peterson14339b62009-01-31 16:36:08 +000012341 j = len;
12342 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012343 j--;
12344 while (j >= i) {
12345 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12346 if (!BLOOM(sepmask, ch))
12347 break;
12348 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12349 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012351 }
12352
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012354 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012355
Victor Stinner7931d9a2011-11-04 00:22:48 +010012356 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357}
12358
12359PyObject*
12360PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12361{
12362 unsigned char *data;
12363 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012364 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365
Victor Stinnerde636f32011-10-01 03:55:54 +020012366 if (PyUnicode_READY(self) == -1)
12367 return NULL;
12368
Victor Stinner684d5fd2012-05-03 02:32:34 +020012369 length = PyUnicode_GET_LENGTH(self);
12370 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012371
Victor Stinner684d5fd2012-05-03 02:32:34 +020012372 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012373 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374
Victor Stinnerde636f32011-10-01 03:55:54 +020012375 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012376 PyErr_SetString(PyExc_IndexError, "string index out of range");
12377 return NULL;
12378 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012379 if (start >= length || end < start)
12380 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012381
Victor Stinner684d5fd2012-05-03 02:32:34 +020012382 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012383 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012384 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012385 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012386 }
12387 else {
12388 kind = PyUnicode_KIND(self);
12389 data = PyUnicode_1BYTE_DATA(self);
12390 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012391 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012392 length);
12393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395
12396static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012397do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 Py_ssize_t len, i, j;
12400
12401 if (PyUnicode_READY(self) == -1)
12402 return NULL;
12403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012405
Victor Stinnercc7af722013-04-09 22:39:24 +020012406 if (PyUnicode_IS_ASCII(self)) {
12407 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12408
12409 i = 0;
12410 if (striptype != RIGHTSTRIP) {
12411 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012412 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012413 if (!_Py_ascii_whitespace[ch])
12414 break;
12415 i++;
12416 }
12417 }
12418
12419 j = len;
12420 if (striptype != LEFTSTRIP) {
12421 j--;
12422 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012423 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012424 if (!_Py_ascii_whitespace[ch])
12425 break;
12426 j--;
12427 }
12428 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012429 }
12430 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012431 else {
12432 int kind = PyUnicode_KIND(self);
12433 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012434
Victor Stinnercc7af722013-04-09 22:39:24 +020012435 i = 0;
12436 if (striptype != RIGHTSTRIP) {
12437 while (i < len) {
12438 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12439 if (!Py_UNICODE_ISSPACE(ch))
12440 break;
12441 i++;
12442 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012443 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012444
12445 j = len;
12446 if (striptype != LEFTSTRIP) {
12447 j--;
12448 while (j >= i) {
12449 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12450 if (!Py_UNICODE_ISSPACE(ch))
12451 break;
12452 j--;
12453 }
12454 j++;
12455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457
Victor Stinner7931d9a2011-11-04 00:22:48 +010012458 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459}
12460
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461
12462static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012463do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012465 if (sep != NULL && sep != Py_None) {
12466 if (PyUnicode_Check(sep))
12467 return _PyUnicode_XStrip(self, striptype, sep);
12468 else {
12469 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "%s arg must be None or str",
12471 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012472 return NULL;
12473 }
12474 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012475
Benjamin Peterson14339b62009-01-31 16:36:08 +000012476 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477}
12478
12479
INADA Naoki3ae20562017-01-16 20:41:20 +090012480/*[clinic input]
12481str.strip as unicode_strip
12482
12483 chars: object = None
12484 /
12485
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012487
12488If chars is given and not None, remove characters in chars instead.
12489[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012490
12491static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012492unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012493/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012494{
INADA Naoki3ae20562017-01-16 20:41:20 +090012495 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012496}
12497
12498
INADA Naoki3ae20562017-01-16 20:41:20 +090012499/*[clinic input]
12500str.lstrip as unicode_lstrip
12501
12502 chars: object = NULL
12503 /
12504
12505Return a copy of the string with leading whitespace removed.
12506
12507If chars is given and not None, remove characters in chars instead.
12508[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012509
12510static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012511unicode_lstrip_impl(PyObject *self, PyObject *chars)
12512/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012513{
INADA Naoki3ae20562017-01-16 20:41:20 +090012514 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012515}
12516
12517
INADA Naoki3ae20562017-01-16 20:41:20 +090012518/*[clinic input]
12519str.rstrip as unicode_rstrip
12520
12521 chars: object = NULL
12522 /
12523
12524Return a copy of the string with trailing whitespace removed.
12525
12526If chars is given and not None, remove characters in chars instead.
12527[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012528
12529static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012530unicode_rstrip_impl(PyObject *self, PyObject *chars)
12531/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012532{
INADA Naoki3ae20562017-01-16 20:41:20 +090012533 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012534}
12535
12536
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012538unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012540 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
Serhiy Storchaka05997252013-01-26 12:14:02 +020012543 if (len < 1)
12544 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
Victor Stinnerc4b49542011-12-11 22:44:26 +010012546 /* no repeat, return original string */
12547 if (len == 1)
12548 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012549
Benjamin Petersonbac79492012-01-14 13:34:47 -050012550 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 return NULL;
12552
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012553 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012554 PyErr_SetString(PyExc_OverflowError,
12555 "repeated string is too long");
12556 return NULL;
12557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012559
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012560 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561 if (!u)
12562 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012563 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 if (PyUnicode_GET_LENGTH(str) == 1) {
12566 const int kind = PyUnicode_KIND(str);
12567 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012568 if (kind == PyUnicode_1BYTE_KIND) {
12569 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012570 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012571 }
12572 else if (kind == PyUnicode_2BYTE_KIND) {
12573 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012574 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012575 ucs2[n] = fill_char;
12576 } else {
12577 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12578 assert(kind == PyUnicode_4BYTE_KIND);
12579 for (n = 0; n < len; ++n)
12580 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 }
12583 else {
12584 /* number of characters copied this far */
12585 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012586 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012588 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012592 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012593 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595 }
12596
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012597 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012598 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599}
12600
Alexander Belopolsky40018472011-02-26 01:02:56 +000012601PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012602PyUnicode_Replace(PyObject *str,
12603 PyObject *substr,
12604 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012605 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012607 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12608 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012610 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611}
12612
INADA Naoki3ae20562017-01-16 20:41:20 +090012613/*[clinic input]
12614str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615
INADA Naoki3ae20562017-01-16 20:41:20 +090012616 old: unicode
12617 new: unicode
12618 count: Py_ssize_t = -1
12619 Maximum number of occurrences to replace.
12620 -1 (the default value) means replace all occurrences.
12621 /
12622
12623Return a copy with all occurrences of substring old replaced by new.
12624
12625If the optional argument count is given, only the first count occurrences are
12626replaced.
12627[clinic start generated code]*/
12628
12629static PyObject *
12630unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12631 Py_ssize_t count)
12632/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012634 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012636 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637}
12638
Alexander Belopolsky40018472011-02-26 01:02:56 +000012639static PyObject *
12640unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012642 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 Py_ssize_t isize;
12644 Py_ssize_t osize, squote, dquote, i, o;
12645 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012646 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012650 return NULL;
12651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 isize = PyUnicode_GET_LENGTH(unicode);
12653 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 /* Compute length of output, quote characters, and
12656 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012657 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 max = 127;
12659 squote = dquote = 0;
12660 ikind = PyUnicode_KIND(unicode);
12661 for (i = 0; i < isize; i++) {
12662 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012663 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012665 case '\'': squote++; break;
12666 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012668 incr = 2;
12669 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 default:
12671 /* Fast-path ASCII */
12672 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012673 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012675 ;
12676 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012679 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012681 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012683 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012685 if (osize > PY_SSIZE_T_MAX - incr) {
12686 PyErr_SetString(PyExc_OverflowError,
12687 "string is too long to generate repr");
12688 return NULL;
12689 }
12690 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 }
12692
12693 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012694 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012696 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 if (dquote)
12698 /* Both squote and dquote present. Use squote,
12699 and escape them */
12700 osize += squote;
12701 else
12702 quote = '"';
12703 }
Victor Stinner55c08782013-04-14 18:45:39 +020012704 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705
12706 repr = PyUnicode_New(osize, max);
12707 if (repr == NULL)
12708 return NULL;
12709 okind = PyUnicode_KIND(repr);
12710 odata = PyUnicode_DATA(repr);
12711
12712 PyUnicode_WRITE(okind, odata, 0, quote);
12713 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012714 if (unchanged) {
12715 _PyUnicode_FastCopyCharacters(repr, 1,
12716 unicode, 0,
12717 isize);
12718 }
12719 else {
12720 for (i = 0, o = 1; i < isize; i++) {
12721 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722
Victor Stinner55c08782013-04-14 18:45:39 +020012723 /* Escape quotes and backslashes */
12724 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012725 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012727 continue;
12728 }
12729
12730 /* Map special whitespace to '\t', \n', '\r' */
12731 if (ch == '\t') {
12732 PyUnicode_WRITE(okind, odata, o++, '\\');
12733 PyUnicode_WRITE(okind, odata, o++, 't');
12734 }
12735 else if (ch == '\n') {
12736 PyUnicode_WRITE(okind, odata, o++, '\\');
12737 PyUnicode_WRITE(okind, odata, o++, 'n');
12738 }
12739 else if (ch == '\r') {
12740 PyUnicode_WRITE(okind, odata, o++, '\\');
12741 PyUnicode_WRITE(okind, odata, o++, 'r');
12742 }
12743
12744 /* Map non-printable US ASCII to '\xhh' */
12745 else if (ch < ' ' || ch == 0x7F) {
12746 PyUnicode_WRITE(okind, odata, o++, '\\');
12747 PyUnicode_WRITE(okind, odata, o++, 'x');
12748 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12749 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12750 }
12751
12752 /* Copy ASCII characters as-is */
12753 else if (ch < 0x7F) {
12754 PyUnicode_WRITE(okind, odata, o++, ch);
12755 }
12756
12757 /* Non-ASCII characters */
12758 else {
12759 /* Map Unicode whitespace and control characters
12760 (categories Z* and C* except ASCII space)
12761 */
12762 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12763 PyUnicode_WRITE(okind, odata, o++, '\\');
12764 /* Map 8-bit characters to '\xhh' */
12765 if (ch <= 0xff) {
12766 PyUnicode_WRITE(okind, odata, o++, 'x');
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12769 }
12770 /* Map 16-bit characters to '\uxxxx' */
12771 else if (ch <= 0xffff) {
12772 PyUnicode_WRITE(okind, odata, o++, 'u');
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12777 }
12778 /* Map 21-bit characters to '\U00xxxxxx' */
12779 else {
12780 PyUnicode_WRITE(okind, odata, o++, 'U');
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12783 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12784 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12785 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12786 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12787 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12788 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12789 }
12790 }
12791 /* Copy characters as-is */
12792 else {
12793 PyUnicode_WRITE(okind, odata, o++, ch);
12794 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012795 }
12796 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012797 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012799 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012800 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801}
12802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012803PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805\n\
12806Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012807such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808arguments start and end are interpreted as in slice notation.\n\
12809\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012810Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811
12812static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012815 /* initialize variables to prevent gcc warning */
12816 PyObject *substring = NULL;
12817 Py_ssize_t start = 0;
12818 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012821 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012824 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012827 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829 if (result == -2)
12830 return NULL;
12831
Christian Heimes217cfd12007-12-02 14:31:20 +000012832 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833}
12834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012835PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012838Return the highest index in S where substring sub is found,\n\
12839such that sub is contained within S[start:end]. Optional\n\
12840arguments start and end are interpreted as in slice notation.\n\
12841\n\
12842Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843
12844static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012845unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012847 /* initialize variables to prevent gcc warning */
12848 PyObject *substring = NULL;
12849 Py_ssize_t start = 0;
12850 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012851 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012853 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012856 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012859 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 if (result == -2)
12862 return NULL;
12863
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864 if (result < 0) {
12865 PyErr_SetString(PyExc_ValueError, "substring not found");
12866 return NULL;
12867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012868
Christian Heimes217cfd12007-12-02 14:31:20 +000012869 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870}
12871
INADA Naoki3ae20562017-01-16 20:41:20 +090012872/*[clinic input]
12873str.rjust as unicode_rjust
12874
12875 width: Py_ssize_t
12876 fillchar: Py_UCS4 = ' '
12877 /
12878
12879Return a right-justified string of length width.
12880
12881Padding is done using the specified fill character (default is a space).
12882[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883
12884static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012885unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12886/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012888 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889 return NULL;
12890
Victor Stinnerc4b49542011-12-11 22:44:26 +010012891 if (PyUnicode_GET_LENGTH(self) >= width)
12892 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893
Victor Stinnerc4b49542011-12-11 22:44:26 +010012894 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895}
12896
Alexander Belopolsky40018472011-02-26 01:02:56 +000012897PyObject *
12898PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012900 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012903 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904}
12905
INADA Naoki3ae20562017-01-16 20:41:20 +090012906/*[clinic input]
12907str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908
INADA Naoki3ae20562017-01-16 20:41:20 +090012909 sep: object = None
        The delimiter according to which to split the string.
12911 None (the default value) means split according to any whitespace,
12912 and discard empty strings from the result.
12913 maxsplit: Py_ssize_t = -1
12914 Maximum number of splits to do.
12915 -1 (the default value) means no limit.
12916
12917Return a list of the words in the string, using sep as the delimiter string.
12918[clinic start generated code]*/
12919
12920static PyObject *
12921unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12922/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923{
INADA Naoki3ae20562017-01-16 20:41:20 +090012924 if (sep == Py_None)
12925 return split(self, NULL, maxsplit);
12926 if (PyUnicode_Check(sep))
12927 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012928
12929 PyErr_Format(PyExc_TypeError,
12930 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012931 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933}
12934
Thomas Wouters477c8d52006-05-27 19:21:47 +000012935PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012936PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012937{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012938 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012939 int kind1, kind2;
12940 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012942
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012943 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012945
Victor Stinner14f8f022011-10-05 20:58:25 +020012946 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 len1 = PyUnicode_GET_LENGTH(str_obj);
12949 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012950 if (kind1 < kind2 || len1 < len2) {
12951 _Py_INCREF_UNICODE_EMPTY();
12952 if (!unicode_empty)
12953 out = NULL;
12954 else {
12955 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12956 Py_DECREF(unicode_empty);
12957 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012958 return out;
12959 }
12960 buf1 = PyUnicode_DATA(str_obj);
12961 buf2 = PyUnicode_DATA(sep_obj);
12962 if (kind2 != kind1) {
12963 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12964 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012965 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012968 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012970 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12971 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12972 else
12973 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 break;
12975 case PyUnicode_2BYTE_KIND:
12976 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12977 break;
12978 case PyUnicode_4BYTE_KIND:
12979 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12980 break;
12981 default:
12982 assert(0);
12983 out = 0;
12984 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012986 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988
12989 return out;
12990}
12991
12992
12993PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012994PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012997 int kind1, kind2;
12998 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013000
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013001 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013003
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013004 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 len1 = PyUnicode_GET_LENGTH(str_obj);
13007 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 if (kind1 < kind2 || len1 < len2) {
13009 _Py_INCREF_UNICODE_EMPTY();
13010 if (!unicode_empty)
13011 out = NULL;
13012 else {
13013 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13014 Py_DECREF(unicode_empty);
13015 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013016 return out;
13017 }
13018 buf1 = PyUnicode_DATA(str_obj);
13019 buf2 = PyUnicode_DATA(sep_obj);
13020 if (kind2 != kind1) {
13021 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13022 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013023 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013026 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013028 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13029 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13030 else
13031 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 break;
13033 case PyUnicode_2BYTE_KIND:
13034 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13035 break;
13036 case PyUnicode_4BYTE_KIND:
13037 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13038 break;
13039 default:
13040 assert(0);
13041 out = 0;
13042 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013043
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013044 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013046
13047 return out;
13048}
13049
INADA Naoki3ae20562017-01-16 20:41:20 +090013050/*[clinic input]
13051str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052
INADA Naoki3ae20562017-01-16 20:41:20 +090013053 sep: object
13054 /
13055
13056Partition the string into three parts using the given separator.
13057
13058This will search for the separator in the string. If the separator is found,
13059returns a 3-tuple containing the part before the separator, the separator
13060itself, and the part after it.
13061
13062If the separator is not found, returns a 3-tuple containing the original string
13063and two empty strings.
13064[clinic start generated code]*/
13065
13066static PyObject *
13067unicode_partition(PyObject *self, PyObject *sep)
13068/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013069{
INADA Naoki3ae20562017-01-16 20:41:20 +090013070 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013071}
13072
INADA Naoki3ae20562017-01-16 20:41:20 +090013073/*[clinic input]
13074str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013075
INADA Naoki3ae20562017-01-16 20:41:20 +090013076Partition the string into three parts using the given separator.
13077
This will search for the separator in the string, starting at the end. If
13079the separator is found, returns a 3-tuple containing the part before the
13080separator, the separator itself, and the part after it.
13081
13082If the separator is not found, returns a 3-tuple containing two empty strings
13083and the original string.
13084[clinic start generated code]*/
13085
13086static PyObject *
13087unicode_rpartition(PyObject *self, PyObject *sep)
13088/*[clinic end generated code: output=1aa13cf1156572aa input=e77c7acb69bdfca6]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013089{
INADA Naoki3ae20562017-01-16 20:41:20 +090013090 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013091}
13092
Alexander Belopolsky40018472011-02-26 01:02:56 +000013093PyObject *
13094PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013095{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013096 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013097 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013098
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013099 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013100}
13101
INADA Naoki3ae20562017-01-16 20:41:20 +090013102/*[clinic input]
13103str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013104
INADA Naoki3ae20562017-01-16 20:41:20 +090013105Return a list of the words in the string, using sep as the delimiter string.
13106
13107Splits are done starting at the end of the string and working to the front.
13108[clinic start generated code]*/
13109
13110static PyObject *
13111unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13112/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013113{
INADA Naoki3ae20562017-01-16 20:41:20 +090013114 if (sep == Py_None)
13115 return rsplit(self, NULL, maxsplit);
13116 if (PyUnicode_Check(sep))
13117 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013118
13119 PyErr_Format(PyExc_TypeError,
13120 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013121 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013122 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013123}
13124
INADA Naoki3ae20562017-01-16 20:41:20 +090013125/*[clinic input]
13126str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013128 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013129
13130Return a list of the lines in the string, breaking at line boundaries.
13131
13132Line breaks are not included in the resulting list unless keepends is given and
13133true.
13134[clinic start generated code]*/
13135
13136static PyObject *
13137unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013138/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013140 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141}
13142
13143static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013144PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013146 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147}
13148
INADA Naoki3ae20562017-01-16 20:41:20 +090013149/*[clinic input]
13150str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151
INADA Naoki3ae20562017-01-16 20:41:20 +090013152Convert uppercase characters to lowercase and lowercase characters to uppercase.
13153[clinic start generated code]*/
13154
13155static PyObject *
13156unicode_swapcase_impl(PyObject *self)
13157/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013159 if (PyUnicode_READY(self) == -1)
13160 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013161 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162}
13163
Larry Hastings61272b72014-01-07 12:41:53 -080013164/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013165
Larry Hastings31826802013-10-19 00:09:25 -070013166@staticmethod
13167str.maketrans as unicode_maketrans
13168
13169 x: object
13170
13171 y: unicode=NULL
13172
13173 z: unicode=NULL
13174
13175 /
13176
13177Return a translation table usable for str.translate().
13178
13179If there is only one argument, it must be a dictionary mapping Unicode
13180ordinals (integers) or characters to Unicode ordinals, strings or None.
13181Character keys will be then converted to ordinals.
13182If there are two arguments, they must be strings of equal length, and
13183in the resulting dictionary, each character in x will be mapped to the
13184character at the same position in y. If there is a third argument, it
13185must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013186[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013187
Larry Hastings31826802013-10-19 00:09:25 -070013188static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013189unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013190/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013191{
Georg Brandlceee0772007-11-27 23:48:05 +000013192 PyObject *new = NULL, *key, *value;
13193 Py_ssize_t i = 0;
13194 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013195
Georg Brandlceee0772007-11-27 23:48:05 +000013196 new = PyDict_New();
13197 if (!new)
13198 return NULL;
13199 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 int x_kind, y_kind, z_kind;
13201 void *x_data, *y_data, *z_data;
13202
Georg Brandlceee0772007-11-27 23:48:05 +000013203 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013204 if (!PyUnicode_Check(x)) {
13205 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13206 "be a string if there is a second argument");
13207 goto err;
13208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013210 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13211 "arguments must have equal length");
13212 goto err;
13213 }
13214 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 x_kind = PyUnicode_KIND(x);
13216 y_kind = PyUnicode_KIND(y);
13217 x_data = PyUnicode_DATA(x);
13218 y_data = PyUnicode_DATA(y);
13219 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13220 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013221 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013222 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013223 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013224 if (!value) {
13225 Py_DECREF(key);
13226 goto err;
13227 }
Georg Brandlceee0772007-11-27 23:48:05 +000013228 res = PyDict_SetItem(new, key, value);
13229 Py_DECREF(key);
13230 Py_DECREF(value);
13231 if (res < 0)
13232 goto err;
13233 }
13234 /* create entries for deleting chars in z */
13235 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 z_kind = PyUnicode_KIND(z);
13237 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013238 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013239 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013240 if (!key)
13241 goto err;
13242 res = PyDict_SetItem(new, key, Py_None);
13243 Py_DECREF(key);
13244 if (res < 0)
13245 goto err;
13246 }
13247 }
13248 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013249 int kind;
13250 void *data;
13251
Georg Brandlceee0772007-11-27 23:48:05 +000013252 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013253 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013254 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13255 "to maketrans it must be a dict");
13256 goto err;
13257 }
13258 /* copy entries into the new dict, converting string keys to int keys */
13259 while (PyDict_Next(x, &i, &key, &value)) {
13260 if (PyUnicode_Check(key)) {
13261 /* convert string keys to integer keys */
13262 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013263 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013264 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13265 "table must be of length 1");
13266 goto err;
13267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 kind = PyUnicode_KIND(key);
13269 data = PyUnicode_DATA(key);
13270 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013271 if (!newkey)
13272 goto err;
13273 res = PyDict_SetItem(new, newkey, value);
13274 Py_DECREF(newkey);
13275 if (res < 0)
13276 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013277 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013278 /* just keep integer keys */
13279 if (PyDict_SetItem(new, key, value) < 0)
13280 goto err;
13281 } else {
13282 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13283 "be strings or integers");
13284 goto err;
13285 }
13286 }
13287 }
13288 return new;
13289 err:
13290 Py_DECREF(new);
13291 return NULL;
13292}
13293
INADA Naoki3ae20562017-01-16 20:41:20 +090013294/*[clinic input]
13295str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296
INADA Naoki3ae20562017-01-16 20:41:20 +090013297 table: object
13298 Translation table, which must be a mapping of Unicode ordinals to
13299 Unicode ordinals, strings, or None.
13300 /
13301
13302Replace each character in the string using the given translation table.
13303
13304The table must implement lookup/indexing via __getitem__, for instance a
13305dictionary or list. If this operation raises LookupError, the character is
13306left untouched. Characters mapped to None are deleted.
13307[clinic start generated code]*/
13308
13309static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013311/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314}
13315
INADA Naoki3ae20562017-01-16 20:41:20 +090013316/*[clinic input]
13317str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318
INADA Naoki3ae20562017-01-16 20:41:20 +090013319Return a copy of the string converted to uppercase.
13320[clinic start generated code]*/
13321
13322static PyObject *
13323unicode_upper_impl(PyObject *self)
13324/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013326 if (PyUnicode_READY(self) == -1)
13327 return NULL;
13328 if (PyUnicode_IS_ASCII(self))
13329 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013330 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331}
13332
INADA Naoki3ae20562017-01-16 20:41:20 +090013333/*[clinic input]
13334str.zfill as unicode_zfill
13335
13336 width: Py_ssize_t
13337 /
13338
13339Pad a numeric string with zeros on the left, to fill a field of the given width.
13340
13341The string is never truncated.
13342[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343
13344static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013345unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013346/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013348 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013349 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 int kind;
13351 void *data;
13352 Py_UCS4 chr;
13353
Benjamin Petersonbac79492012-01-14 13:34:47 -050013354 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356
Victor Stinnerc4b49542011-12-11 22:44:26 +010013357 if (PyUnicode_GET_LENGTH(self) >= width)
13358 return unicode_result_unchanged(self);
13359
13360 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361
13362 u = pad(self, fill, 0, '0');
13363
Walter Dörwald068325e2002-04-15 13:36:47 +000013364 if (u == NULL)
13365 return NULL;
13366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013367 kind = PyUnicode_KIND(u);
13368 data = PyUnicode_DATA(u);
13369 chr = PyUnicode_READ(kind, data, fill);
13370
13371 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013373 PyUnicode_WRITE(kind, data, 0, chr);
13374 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375 }
13376
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013377 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013378 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013379}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380
#if 0
/* Compiled out by the enclosing #if 0.  Thin wrapper that forwards self
   to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013389PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013392Return True if S starts with the specified prefix, False otherwise.\n\
13393With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013394With optional end, stop comparing S at that position.\n\
13395prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396
13397static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013398unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013400{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013402 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013403 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013404 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013405 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406
Jesus Ceaac451502011-04-20 17:09:23 +020013407 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013409 if (PyTuple_Check(subobj)) {
13410 Py_ssize_t i;
13411 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013412 substring = PyTuple_GET_ITEM(subobj, i);
13413 if (!PyUnicode_Check(substring)) {
13414 PyErr_Format(PyExc_TypeError,
13415 "tuple for startswith must only contain str, "
13416 "not %.100s",
13417 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013418 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013419 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013420 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013421 if (result == -1)
13422 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013423 if (result) {
13424 Py_RETURN_TRUE;
13425 }
13426 }
13427 /* nothing matched */
13428 Py_RETURN_FALSE;
13429 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013430 if (!PyUnicode_Check(subobj)) {
13431 PyErr_Format(PyExc_TypeError,
13432 "startswith first arg must be str or "
13433 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013435 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013436 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013437 if (result == -1)
13438 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013439 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440}
13441
13442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013443PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013446Return True if S ends with the specified suffix, False otherwise.\n\
13447With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013448With optional end, stop comparing S at that position.\n\
13449suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450
13451static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013452unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013455 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013456 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013457 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013458 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013459 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460
Jesus Ceaac451502011-04-20 17:09:23 +020013461 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013462 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013463 if (PyTuple_Check(subobj)) {
13464 Py_ssize_t i;
13465 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013466 substring = PyTuple_GET_ITEM(subobj, i);
13467 if (!PyUnicode_Check(substring)) {
13468 PyErr_Format(PyExc_TypeError,
13469 "tuple for endswith must only contain str, "
13470 "not %.100s",
13471 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013473 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013474 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013475 if (result == -1)
13476 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013477 if (result) {
13478 Py_RETURN_TRUE;
13479 }
13480 }
13481 Py_RETURN_FALSE;
13482 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013483 if (!PyUnicode_Check(subobj)) {
13484 PyErr_Format(PyExc_TypeError,
13485 "endswith first arg must be str or "
13486 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013487 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013488 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013489 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013490 if (result == -1)
13491 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013492 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013493}
13494
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013495static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013496_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013497{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013498 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13499 writer->data = PyUnicode_DATA(writer->buffer);
13500
13501 if (!writer->readonly) {
13502 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013503 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013504 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013505 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013506 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13507 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13508 writer->kind = PyUnicode_WCHAR_KIND;
13509 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13510
Victor Stinner8f674cc2013-04-17 23:02:17 +020013511 /* Copy-on-write mode: set buffer size to 0 so
13512 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13513 * next write. */
13514 writer->size = 0;
13515 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013516}
13517
Victor Stinnerd3f08822012-05-29 12:57:52 +020013518void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013519_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013520{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013521 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013522
13523 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013524 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013525
13526 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13527 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13528 writer->kind = PyUnicode_WCHAR_KIND;
13529 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013530}
13531
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532int
13533_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13534 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013535{
13536 Py_ssize_t newlen;
13537 PyObject *newbuffer;
13538
Victor Stinner2740e462016-09-06 16:58:36 -070013539 assert(maxchar <= MAX_UNICODE);
13540
Victor Stinnerca9381e2015-09-22 00:58:32 +020013541 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013542 assert((maxchar > writer->maxchar && length >= 0)
13543 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544
Victor Stinner202fdca2012-05-07 12:47:02 +020013545 if (length > PY_SSIZE_T_MAX - writer->pos) {
13546 PyErr_NoMemory();
13547 return -1;
13548 }
13549 newlen = writer->pos + length;
13550
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013551 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013552
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013554 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013555 if (writer->overallocate
13556 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13557 /* overallocate to limit the number of realloc() */
13558 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013559 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013560 if (newlen < writer->min_length)
13561 newlen = writer->min_length;
13562
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563 writer->buffer = PyUnicode_New(newlen, maxchar);
13564 if (writer->buffer == NULL)
13565 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013566 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013567 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013568 if (writer->overallocate
13569 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13570 /* overallocate to limit the number of realloc() */
13571 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013572 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013573 if (newlen < writer->min_length)
13574 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013575
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013576 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013577 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013578 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013579 newbuffer = PyUnicode_New(newlen, maxchar);
13580 if (newbuffer == NULL)
13581 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013582 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13583 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013584 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013585 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013586 }
13587 else {
13588 newbuffer = resize_compact(writer->buffer, newlen);
13589 if (newbuffer == NULL)
13590 return -1;
13591 }
13592 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013593 }
13594 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013595 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596 newbuffer = PyUnicode_New(writer->size, maxchar);
13597 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013598 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013599 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13600 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013601 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013602 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013603 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013604 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013605
13606#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013607}
13608
Victor Stinnerca9381e2015-09-22 00:58:32 +020013609int
13610_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13611 enum PyUnicode_Kind kind)
13612{
13613 Py_UCS4 maxchar;
13614
13615 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13616 assert(writer->kind < kind);
13617
13618 switch (kind)
13619 {
13620 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13621 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13622 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13623 default:
13624 assert(0 && "invalid kind");
13625 return -1;
13626 }
13627
13628 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13629}
13630
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013631static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013632_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013633{
Victor Stinner2740e462016-09-06 16:58:36 -070013634 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013635 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13636 return -1;
13637 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13638 writer->pos++;
13639 return 0;
13640}
13641
13642int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013643_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13644{
13645 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13646}
13647
13648int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13650{
13651 Py_UCS4 maxchar;
13652 Py_ssize_t len;
13653
13654 if (PyUnicode_READY(str) == -1)
13655 return -1;
13656 len = PyUnicode_GET_LENGTH(str);
13657 if (len == 0)
13658 return 0;
13659 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13660 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013661 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013662 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013663 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013664 Py_INCREF(str);
13665 writer->buffer = str;
13666 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013667 writer->pos += len;
13668 return 0;
13669 }
13670 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13671 return -1;
13672 }
13673 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13674 str, 0, len);
13675 writer->pos += len;
13676 return 0;
13677}
13678
Victor Stinnere215d962012-10-06 23:03:36 +020013679int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013680_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13681 Py_ssize_t start, Py_ssize_t end)
13682{
13683 Py_UCS4 maxchar;
13684 Py_ssize_t len;
13685
13686 if (PyUnicode_READY(str) == -1)
13687 return -1;
13688
13689 assert(0 <= start);
13690 assert(end <= PyUnicode_GET_LENGTH(str));
13691 assert(start <= end);
13692
13693 if (end == 0)
13694 return 0;
13695
13696 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13697 return _PyUnicodeWriter_WriteStr(writer, str);
13698
13699 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13700 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13701 else
13702 maxchar = writer->maxchar;
13703 len = end - start;
13704
13705 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13706 return -1;
13707
13708 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13709 str, start, len);
13710 writer->pos += len;
13711 return 0;
13712}
13713
13714int
Victor Stinner4a587072013-11-19 12:54:53 +010013715_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13716 const char *ascii, Py_ssize_t len)
13717{
13718 if (len == -1)
13719 len = strlen(ascii);
13720
13721 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13722
13723 if (writer->buffer == NULL && !writer->overallocate) {
13724 PyObject *str;
13725
13726 str = _PyUnicode_FromASCII(ascii, len);
13727 if (str == NULL)
13728 return -1;
13729
13730 writer->readonly = 1;
13731 writer->buffer = str;
13732 _PyUnicodeWriter_Update(writer);
13733 writer->pos += len;
13734 return 0;
13735 }
13736
13737 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13738 return -1;
13739
13740 switch (writer->kind)
13741 {
13742 case PyUnicode_1BYTE_KIND:
13743 {
13744 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13745 Py_UCS1 *data = writer->data;
13746
Christian Heimesf051e432016-09-13 20:22:02 +020013747 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013748 break;
13749 }
13750 case PyUnicode_2BYTE_KIND:
13751 {
13752 _PyUnicode_CONVERT_BYTES(
13753 Py_UCS1, Py_UCS2,
13754 ascii, ascii + len,
13755 (Py_UCS2 *)writer->data + writer->pos);
13756 break;
13757 }
13758 case PyUnicode_4BYTE_KIND:
13759 {
13760 _PyUnicode_CONVERT_BYTES(
13761 Py_UCS1, Py_UCS4,
13762 ascii, ascii + len,
13763 (Py_UCS4 *)writer->data + writer->pos);
13764 break;
13765 }
13766 default:
13767 assert(0);
13768 }
13769
13770 writer->pos += len;
13771 return 0;
13772}
13773
13774int
13775_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13776 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013777{
13778 Py_UCS4 maxchar;
13779
13780 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13781 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13782 return -1;
13783 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13784 writer->pos += len;
13785 return 0;
13786}
13787
Victor Stinnerd3f08822012-05-29 12:57:52 +020013788PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013789_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013790{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013791 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013792
Victor Stinnerd3f08822012-05-29 12:57:52 +020013793 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013794 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013795 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013796 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013797
13798 str = writer->buffer;
13799 writer->buffer = NULL;
13800
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013801 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013802 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13803 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013804 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013805
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013806 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13807 PyObject *str2;
13808 str2 = resize_compact(str, writer->pos);
13809 if (str2 == NULL) {
13810 Py_DECREF(str);
13811 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013812 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013813 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013814 }
13815
Victor Stinner15a0bd32013-07-08 22:29:55 +020013816 assert(_PyUnicode_CheckConsistency(str, 1));
13817 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013818}
13819
Victor Stinnerd3f08822012-05-29 12:57:52 +020013820void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013821_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013822{
13823 Py_CLEAR(writer->buffer);
13824}
13825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013826#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013827
13828PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013830\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013831Return a formatted version of S, using substitutions from args and kwargs.\n\
13832The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013833
Eric Smith27bbca62010-11-04 17:06:58 +000013834PyDoc_STRVAR(format_map__doc__,
13835 "S.format_map(mapping) -> str\n\
13836\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013837Return a formatted version of S, using substitutions from mapping.\n\
13838The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013839
INADA Naoki3ae20562017-01-16 20:41:20 +090013840/*[clinic input]
13841str.__format__ as unicode___format__
13842
13843 format_spec: unicode
13844 /
13845
13846Return a formatted version of the string as described by format_spec.
13847[clinic start generated code]*/
13848
Eric Smith4a7d76d2008-05-30 18:10:19 +000013849static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013850unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013851/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013852{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013853 _PyUnicodeWriter writer;
13854 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013855
Victor Stinnerd3f08822012-05-29 12:57:52 +020013856 if (PyUnicode_READY(self) == -1)
13857 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013858 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013859 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13860 self, format_spec, 0,
13861 PyUnicode_GET_LENGTH(format_spec));
13862 if (ret == -1) {
13863 _PyUnicodeWriter_Dealloc(&writer);
13864 return NULL;
13865 }
13866 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013867}
13868
INADA Naoki3ae20562017-01-16 20:41:20 +090013869/*[clinic input]
13870str.__sizeof__ as unicode_sizeof
13871
13872Return the size of the string in memory, in bytes.
13873[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013874
13875static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013876unicode_sizeof_impl(PyObject *self)
13877/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013879 Py_ssize_t size;
13880
13881 /* If it's a compact object, account for base structure +
13882 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013883 if (PyUnicode_IS_COMPACT_ASCII(self))
13884 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13885 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013887 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013888 else {
13889 /* If it is a two-block object, account for base object, and
13890 for character block if present. */
13891 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013892 if (_PyUnicode_DATA_ANY(self))
13893 size += (PyUnicode_GET_LENGTH(self) + 1) *
13894 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895 }
13896 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013897 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013898 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13899 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13900 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13901 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013902
13903 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013904}
13905
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013906static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013907unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013908{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013909 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013910 if (!copy)
13911 return NULL;
13912 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013913}
13914
/* Method table for the str type.

   Entries written as UNICODE_*_METHODDEF are macros defined outside this
   chunk (NOTE(review): presumably generated by Argument Clinic -- confirm);
   the remaining entries are spelled out by hand.  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}        /* sentinel */
};
13970
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013971static PyObject *
13972unicode_mod(PyObject *v, PyObject *w)
13973{
Brian Curtindfc80e32011-08-10 20:28:54 -050013974 if (!PyUnicode_Check(v))
13975 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013976 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013977}
13978
/* Number protocol slots for str.  Only nb_remainder (the % operator) is
   provided; the slots listed before it are explicitly empty and the ones
   after it are left zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13985
/* Sequence protocol slots for str: length, concatenation, repetition,
   item access and containment.  The slice and item-assignment slots are
   empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13996
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013997static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013998unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014000 if (PyUnicode_READY(self) == -1)
14001 return NULL;
14002
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014003 if (PyIndex_Check(item)) {
14004 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014005 if (i == -1 && PyErr_Occurred())
14006 return NULL;
14007 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014008 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014009 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014010 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000014011 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014012 PyObject *result;
14013 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014014 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014015 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014016
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014017 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014018 return NULL;
14019 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014020 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14021 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014022
14023 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014024 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014025 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014026 slicelength == PyUnicode_GET_LENGTH(self)) {
14027 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014028 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014029 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014030 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014031 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014032 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014033 src_kind = PyUnicode_KIND(self);
14034 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014035 if (!PyUnicode_IS_ASCII(self)) {
14036 kind_limit = kind_maxchar_limit(src_kind);
14037 max_char = 0;
14038 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14039 ch = PyUnicode_READ(src_kind, src_data, cur);
14040 if (ch > max_char) {
14041 max_char = ch;
14042 if (max_char >= kind_limit)
14043 break;
14044 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014045 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014046 }
Victor Stinner55c99112011-10-13 01:17:06 +020014047 else
14048 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014049 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014050 if (result == NULL)
14051 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014052 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014053 dest_data = PyUnicode_DATA(result);
14054
14055 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014056 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14057 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014058 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014059 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014060 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014061 } else {
14062 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14063 return NULL;
14064 }
14065}
14066
/* Mapping protocol slots for str: length and subscript (index or slice).
   Subscript assignment is not provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14072
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073
Guido van Rossumd57fd912000-03-10 22:53:23 +000014074/* Helpers for PyUnicode_Format() */
14075
/* State shared by the PyUnicode_Format() helpers.
   NOTE(review): only args/arglen/argidx are exercised in the code visible
   here; the descriptions of the other fields are inferred from their names
   and types -- confirm against PyUnicode_Format(). */
struct unicode_formatter_t {
    /* Source of format arguments.  unicode_format_getnextarg() returns
       `args` itself when arglen is negative, and otherwise treats it as a
       tuple and returns item `argidx`. */
    PyObject *args;
    int args_owned;             /* presumably: non-zero if `args` must be released -- confirm */
    /* Arguments are available while argidx < arglen; argidx is advanced
       each time one is consumed. */
    Py_ssize_t arglen, argidx;
    PyObject *dict;             /* presumably the mapping used for %(name)s lookups -- confirm */

    /* Format string being processed: character kind, counters/position,
       raw data pointer and the string object (exact counter semantics are
       not visible here -- confirm). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Output writer. */
    _PyUnicodeWriter writer;
};
14089
/* One parsed conversion specifier of a % format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; used as the format code by
                           formatfloat() and mainformatlong() */
    int flags;          /* bit mask tested against F_ALT, F_SIGN, F_BLANK */
    Py_ssize_t width;   /* field width; mainformatlong() compares it with -1
                           (presumably "not given" -- confirm) */
    int prec;           /* precision; a negative value makes formatfloat()
                           fall back to 6 */
    int sign;           /* NOTE(review): not used in the code visible here --
                           confirm meaning */
};
14097
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014099unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100{
Victor Stinnera47082312012-10-04 02:19:54 +020014101 Py_ssize_t argidx = ctx->argidx;
14102
14103 if (argidx < ctx->arglen) {
14104 ctx->argidx++;
14105 if (ctx->arglen < 0)
14106 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014107 else
Victor Stinnera47082312012-10-04 02:19:54 +020014108 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014109 }
14110 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014111 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014112 return NULL;
14113}
14114
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014115/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014116
Victor Stinnera47082312012-10-04 02:19:54 +020014117/* Format a float into the writer if the writer is not NULL, or into *p_output
14118 otherwise.
14119
14120 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014121static int
Victor Stinnera47082312012-10-04 02:19:54 +020014122formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14123 PyObject **p_output,
14124 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014125{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014126 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014127 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014128 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014129 int prec;
14130 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014131
Guido van Rossumd57fd912000-03-10 22:53:23 +000014132 x = PyFloat_AsDouble(v);
14133 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014134 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014135
Victor Stinnera47082312012-10-04 02:19:54 +020014136 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014137 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014138 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014139
Victor Stinnera47082312012-10-04 02:19:54 +020014140 if (arg->flags & F_ALT)
14141 dtoa_flags = Py_DTSF_ALT;
14142 else
14143 dtoa_flags = 0;
14144 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014145 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014146 return -1;
14147 len = strlen(p);
14148 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014149 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014150 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014151 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014152 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014153 }
14154 else
14155 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014156 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014157 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014158}
14159
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value: a new reference to a str object, or NULL with an exception
 * set on error.
 * The output string is of the form
 *     "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions, and only
 * when alt is non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *     val      object to be converted; must satisfy PyLong_Check()
 *     alt      non-zero to keep the base marker (alternate form)
 *     prec     minimum number of digits; 0-fill on left if needed
 *     type     a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 * values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits + prec is computed in an int below, and numnondigits can
       be as large as 3 (sign plus a two-character base marker). */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
        /* fall through: without assertions an unknown type is treated
           as 'd' */
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set.  buf is advanced past the
       two marker characters and the sign, if any, is rewritten at the new
       start, so buf no longer points at the start of result's data. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet the minimum number of digits.  The
       padded text is assembled in a temporary bytes object, which replaces
       `result` until the final conversion below. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker first, then the zero padding, then the
           digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the bytes object, or no longer starts at the
       beginning of the str data, build a fresh str from it; otherwise just
       adjust the length of the existing str if it changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14305
Ethan Furmandf3ed242014-01-05 06:50:30 -080014306/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014308 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014309 * -1 and raise an exception on error */
14310static int
Victor Stinnera47082312012-10-04 02:19:54 +020014311mainformatlong(PyObject *v,
14312 struct unicode_format_arg_t *arg,
14313 PyObject **p_output,
14314 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014315{
14316 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014317 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014318
14319 if (!PyNumber_Check(v))
14320 goto wrongtype;
14321
Ethan Furman9ab74802014-03-21 06:38:46 -070014322 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014323 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014324 if (type == 'o' || type == 'x' || type == 'X') {
14325 iobj = PyNumber_Index(v);
14326 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014327 if (PyErr_ExceptionMatches(PyExc_TypeError))
14328 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014329 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014330 }
14331 }
14332 else {
14333 iobj = PyNumber_Long(v);
14334 if (iobj == NULL ) {
14335 if (PyErr_ExceptionMatches(PyExc_TypeError))
14336 goto wrongtype;
14337 return -1;
14338 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014339 }
14340 assert(PyLong_Check(iobj));
14341 }
14342 else {
14343 iobj = v;
14344 Py_INCREF(iobj);
14345 }
14346
14347 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014348 && arg->width == -1 && arg->prec == -1
14349 && !(arg->flags & (F_SIGN | F_BLANK))
14350 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014351 {
14352 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014353 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014354 int base;
14355
Victor Stinnera47082312012-10-04 02:19:54 +020014356 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014357 {
14358 default:
14359 assert(0 && "'type' not in [diuoxX]");
14360 case 'd':
14361 case 'i':
14362 case 'u':
14363 base = 10;
14364 break;
14365 case 'o':
14366 base = 8;
14367 break;
14368 case 'x':
14369 case 'X':
14370 base = 16;
14371 break;
14372 }
14373
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014374 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14375 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014376 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014377 }
14378 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014379 return 1;
14380 }
14381
Ethan Furmanb95b5612015-01-23 20:05:18 -080014382 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014383 Py_DECREF(iobj);
14384 if (res == NULL)
14385 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014386 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014387 return 0;
14388
14389wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014390 switch(type)
14391 {
14392 case 'o':
14393 case 'x':
14394 case 'X':
14395 PyErr_Format(PyExc_TypeError,
14396 "%%%c format: an integer is required, "
14397 "not %.200s",
14398 type, Py_TYPE(v)->tp_name);
14399 break;
14400 default:
14401 PyErr_Format(PyExc_TypeError,
14402 "%%%c format: a number is required, "
14403 "not %.200s",
14404 type, Py_TYPE(v)->tp_name);
14405 break;
14406 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014407 return -1;
14408}
14409
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014410static Py_UCS4
14411formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014412{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014413 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014414 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014415 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014416 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014418 goto onError;
14419 }
14420 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014421 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014422 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014423 /* make sure number is a type of integer */
14424 if (!PyLong_Check(v)) {
14425 iobj = PyNumber_Index(v);
14426 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014427 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014428 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014429 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014430 Py_DECREF(iobj);
14431 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014432 else {
14433 x = PyLong_AsLong(v);
14434 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014435 if (x == -1 && PyErr_Occurred())
14436 goto onError;
14437
Victor Stinner8faf8212011-12-08 22:14:11 +010014438 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 PyErr_SetString(PyExc_OverflowError,
14440 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014441 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014442 }
14443
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014444 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014446
Benjamin Peterson29060642009-01-31 22:14:21 +000014447 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014448 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014449 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014450 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014451}
14452
Victor Stinnera47082312012-10-04 02:19:54 +020014453/* Parse options of an argument: flags, width, precision.
14454 Handle also "%(name)" syntax.
14455
14456 Return 0 if the argument has been formatted into arg->str.
14457 Return 1 if the argument has been written into ctx->writer,
14458 Raise an exception and return -1 on error. */
14459static int
14460unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14461 struct unicode_format_arg_t *arg)
14462{
14463#define FORMAT_READ(ctx) \
14464 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14465
14466 PyObject *v;
14467
Victor Stinnera47082312012-10-04 02:19:54 +020014468 if (arg->ch == '(') {
14469 /* Get argument value from a dictionary. Example: "%(name)s". */
14470 Py_ssize_t keystart;
14471 Py_ssize_t keylen;
14472 PyObject *key;
14473 int pcount = 1;
14474
14475 if (ctx->dict == NULL) {
14476 PyErr_SetString(PyExc_TypeError,
14477 "format requires a mapping");
14478 return -1;
14479 }
14480 ++ctx->fmtpos;
14481 --ctx->fmtcnt;
14482 keystart = ctx->fmtpos;
14483 /* Skip over balanced parentheses */
14484 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14485 arg->ch = FORMAT_READ(ctx);
14486 if (arg->ch == ')')
14487 --pcount;
14488 else if (arg->ch == '(')
14489 ++pcount;
14490 ctx->fmtpos++;
14491 }
14492 keylen = ctx->fmtpos - keystart - 1;
14493 if (ctx->fmtcnt < 0 || pcount > 0) {
14494 PyErr_SetString(PyExc_ValueError,
14495 "incomplete format key");
14496 return -1;
14497 }
14498 key = PyUnicode_Substring(ctx->fmtstr,
14499 keystart, keystart + keylen);
14500 if (key == NULL)
14501 return -1;
14502 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014503 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014504 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014505 }
14506 ctx->args = PyObject_GetItem(ctx->dict, key);
14507 Py_DECREF(key);
14508 if (ctx->args == NULL)
14509 return -1;
14510 ctx->args_owned = 1;
14511 ctx->arglen = -1;
14512 ctx->argidx = -2;
14513 }
14514
14515 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014516 while (--ctx->fmtcnt >= 0) {
14517 arg->ch = FORMAT_READ(ctx);
14518 ctx->fmtpos++;
14519 switch (arg->ch) {
14520 case '-': arg->flags |= F_LJUST; continue;
14521 case '+': arg->flags |= F_SIGN; continue;
14522 case ' ': arg->flags |= F_BLANK; continue;
14523 case '#': arg->flags |= F_ALT; continue;
14524 case '0': arg->flags |= F_ZERO; continue;
14525 }
14526 break;
14527 }
14528
14529 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014530 if (arg->ch == '*') {
14531 v = unicode_format_getnextarg(ctx);
14532 if (v == NULL)
14533 return -1;
14534 if (!PyLong_Check(v)) {
14535 PyErr_SetString(PyExc_TypeError,
14536 "* wants int");
14537 return -1;
14538 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014539 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014540 if (arg->width == -1 && PyErr_Occurred())
14541 return -1;
14542 if (arg->width < 0) {
14543 arg->flags |= F_LJUST;
14544 arg->width = -arg->width;
14545 }
14546 if (--ctx->fmtcnt >= 0) {
14547 arg->ch = FORMAT_READ(ctx);
14548 ctx->fmtpos++;
14549 }
14550 }
14551 else if (arg->ch >= '0' && arg->ch <= '9') {
14552 arg->width = arg->ch - '0';
14553 while (--ctx->fmtcnt >= 0) {
14554 arg->ch = FORMAT_READ(ctx);
14555 ctx->fmtpos++;
14556 if (arg->ch < '0' || arg->ch > '9')
14557 break;
14558 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14559 mixing signed and unsigned comparison. Since arg->ch is between
14560 '0' and '9', casting to int is safe. */
14561 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14562 PyErr_SetString(PyExc_ValueError,
14563 "width too big");
14564 return -1;
14565 }
14566 arg->width = arg->width*10 + (arg->ch - '0');
14567 }
14568 }
14569
14570 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014571 if (arg->ch == '.') {
14572 arg->prec = 0;
14573 if (--ctx->fmtcnt >= 0) {
14574 arg->ch = FORMAT_READ(ctx);
14575 ctx->fmtpos++;
14576 }
14577 if (arg->ch == '*') {
14578 v = unicode_format_getnextarg(ctx);
14579 if (v == NULL)
14580 return -1;
14581 if (!PyLong_Check(v)) {
14582 PyErr_SetString(PyExc_TypeError,
14583 "* wants int");
14584 return -1;
14585 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014586 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014587 if (arg->prec == -1 && PyErr_Occurred())
14588 return -1;
14589 if (arg->prec < 0)
14590 arg->prec = 0;
14591 if (--ctx->fmtcnt >= 0) {
14592 arg->ch = FORMAT_READ(ctx);
14593 ctx->fmtpos++;
14594 }
14595 }
14596 else if (arg->ch >= '0' && arg->ch <= '9') {
14597 arg->prec = arg->ch - '0';
14598 while (--ctx->fmtcnt >= 0) {
14599 arg->ch = FORMAT_READ(ctx);
14600 ctx->fmtpos++;
14601 if (arg->ch < '0' || arg->ch > '9')
14602 break;
14603 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14604 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014605 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014606 return -1;
14607 }
14608 arg->prec = arg->prec*10 + (arg->ch - '0');
14609 }
14610 }
14611 }
14612
14613 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14614 if (ctx->fmtcnt >= 0) {
14615 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14616 if (--ctx->fmtcnt >= 0) {
14617 arg->ch = FORMAT_READ(ctx);
14618 ctx->fmtpos++;
14619 }
14620 }
14621 }
14622 if (ctx->fmtcnt < 0) {
14623 PyErr_SetString(PyExc_ValueError,
14624 "incomplete format");
14625 return -1;
14626 }
14627 return 0;
14628
14629#undef FORMAT_READ
14630}
14631
14632/* Format one argument. Supported conversion specifiers:
14633
14634 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014635 - "i", "d", "u": int or float
14636 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014637 - "e", "E", "f", "F", "g", "G": float
14638 - "c": int or str (1 character)
14639
Victor Stinner8dbd4212012-12-04 09:30:24 +010014640 When possible, the output is written directly into the Unicode writer
14641 (ctx->writer). A string is created when padding is required.
14642
Victor Stinnera47082312012-10-04 02:19:54 +020014643 Return 0 if the argument has been formatted into *p_str,
14644 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014645 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014646static int
14647unicode_format_arg_format(struct unicode_formatter_t *ctx,
14648 struct unicode_format_arg_t *arg,
14649 PyObject **p_str)
14650{
14651 PyObject *v;
14652 _PyUnicodeWriter *writer = &ctx->writer;
14653
14654 if (ctx->fmtcnt == 0)
14655 ctx->writer.overallocate = 0;
14656
Victor Stinnera47082312012-10-04 02:19:54 +020014657 v = unicode_format_getnextarg(ctx);
14658 if (v == NULL)
14659 return -1;
14660
Victor Stinnera47082312012-10-04 02:19:54 +020014661
14662 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014663 case 's':
14664 case 'r':
14665 case 'a':
14666 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14667 /* Fast path */
14668 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14669 return -1;
14670 return 1;
14671 }
14672
14673 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14674 *p_str = v;
14675 Py_INCREF(*p_str);
14676 }
14677 else {
14678 if (arg->ch == 's')
14679 *p_str = PyObject_Str(v);
14680 else if (arg->ch == 'r')
14681 *p_str = PyObject_Repr(v);
14682 else
14683 *p_str = PyObject_ASCII(v);
14684 }
14685 break;
14686
14687 case 'i':
14688 case 'd':
14689 case 'u':
14690 case 'o':
14691 case 'x':
14692 case 'X':
14693 {
14694 int ret = mainformatlong(v, arg, p_str, writer);
14695 if (ret != 0)
14696 return ret;
14697 arg->sign = 1;
14698 break;
14699 }
14700
14701 case 'e':
14702 case 'E':
14703 case 'f':
14704 case 'F':
14705 case 'g':
14706 case 'G':
14707 if (arg->width == -1 && arg->prec == -1
14708 && !(arg->flags & (F_SIGN | F_BLANK)))
14709 {
14710 /* Fast path */
14711 if (formatfloat(v, arg, NULL, writer) == -1)
14712 return -1;
14713 return 1;
14714 }
14715
14716 arg->sign = 1;
14717 if (formatfloat(v, arg, p_str, NULL) == -1)
14718 return -1;
14719 break;
14720
14721 case 'c':
14722 {
14723 Py_UCS4 ch = formatchar(v);
14724 if (ch == (Py_UCS4) -1)
14725 return -1;
14726 if (arg->width == -1 && arg->prec == -1) {
14727 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014728 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014729 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014730 return 1;
14731 }
14732 *p_str = PyUnicode_FromOrdinal(ch);
14733 break;
14734 }
14735
14736 default:
14737 PyErr_Format(PyExc_ValueError,
14738 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014739 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014740 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14741 (int)arg->ch,
14742 ctx->fmtpos - 1);
14743 return -1;
14744 }
14745 if (*p_str == NULL)
14746 return -1;
14747 assert (PyUnicode_Check(*p_str));
14748 return 0;
14749}
14750
14751static int
14752unicode_format_arg_output(struct unicode_formatter_t *ctx,
14753 struct unicode_format_arg_t *arg,
14754 PyObject *str)
14755{
14756 Py_ssize_t len;
14757 enum PyUnicode_Kind kind;
14758 void *pbuf;
14759 Py_ssize_t pindex;
14760 Py_UCS4 signchar;
14761 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014762 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014763 Py_ssize_t sublen;
14764 _PyUnicodeWriter *writer = &ctx->writer;
14765 Py_UCS4 fill;
14766
14767 fill = ' ';
14768 if (arg->sign && arg->flags & F_ZERO)
14769 fill = '0';
14770
14771 if (PyUnicode_READY(str) == -1)
14772 return -1;
14773
14774 len = PyUnicode_GET_LENGTH(str);
14775 if ((arg->width == -1 || arg->width <= len)
14776 && (arg->prec == -1 || arg->prec >= len)
14777 && !(arg->flags & (F_SIGN | F_BLANK)))
14778 {
14779 /* Fast path */
14780 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14781 return -1;
14782 return 0;
14783 }
14784
14785 /* Truncate the string for "s", "r" and "a" formats
14786 if the precision is set */
14787 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14788 if (arg->prec >= 0 && len > arg->prec)
14789 len = arg->prec;
14790 }
14791
14792 /* Adjust sign and width */
14793 kind = PyUnicode_KIND(str);
14794 pbuf = PyUnicode_DATA(str);
14795 pindex = 0;
14796 signchar = '\0';
14797 if (arg->sign) {
14798 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14799 if (ch == '-' || ch == '+') {
14800 signchar = ch;
14801 len--;
14802 pindex++;
14803 }
14804 else if (arg->flags & F_SIGN)
14805 signchar = '+';
14806 else if (arg->flags & F_BLANK)
14807 signchar = ' ';
14808 else
14809 arg->sign = 0;
14810 }
14811 if (arg->width < len)
14812 arg->width = len;
14813
14814 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014815 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014816 if (!(arg->flags & F_LJUST)) {
14817 if (arg->sign) {
14818 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014819 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014820 }
14821 else {
14822 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014823 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014824 }
14825 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014826 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14827 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014828 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014829 }
14830
Victor Stinnera47082312012-10-04 02:19:54 +020014831 buflen = arg->width;
14832 if (arg->sign && len == arg->width)
14833 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014834 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014835 return -1;
14836
14837 /* Write the sign if needed */
14838 if (arg->sign) {
14839 if (fill != ' ') {
14840 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14841 writer->pos += 1;
14842 }
14843 if (arg->width > len)
14844 arg->width--;
14845 }
14846
14847 /* Write the numeric prefix for "x", "X" and "o" formats
14848 if the alternate form is used.
14849 For example, write "0x" for the "%#x" format. */
14850 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14851 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14852 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14853 if (fill != ' ') {
14854 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14856 writer->pos += 2;
14857 pindex += 2;
14858 }
14859 arg->width -= 2;
14860 if (arg->width < 0)
14861 arg->width = 0;
14862 len -= 2;
14863 }
14864
14865 /* Pad left with the fill character if needed */
14866 if (arg->width > len && !(arg->flags & F_LJUST)) {
14867 sublen = arg->width - len;
14868 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14869 writer->pos += sublen;
14870 arg->width = len;
14871 }
14872
14873 /* If padding with spaces: write sign if needed and/or numeric prefix if
14874 the alternate form is used */
14875 if (fill == ' ') {
14876 if (arg->sign) {
14877 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14878 writer->pos += 1;
14879 }
14880 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14881 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14882 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14883 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14884 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14885 writer->pos += 2;
14886 pindex += 2;
14887 }
14888 }
14889
14890 /* Write characters */
14891 if (len) {
14892 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14893 str, pindex, len);
14894 writer->pos += len;
14895 }
14896
14897 /* Pad right with the fill character if needed */
14898 if (arg->width > len) {
14899 sublen = arg->width - len;
14900 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14901 writer->pos += sublen;
14902 }
14903 return 0;
14904}
14905
14906/* Helper of PyUnicode_Format(): format one arg.
14907 Return 0 on success, raise an exception and return -1 on error. */
14908static int
14909unicode_format_arg(struct unicode_formatter_t *ctx)
14910{
14911 struct unicode_format_arg_t arg;
14912 PyObject *str;
14913 int ret;
14914
Victor Stinner8dbd4212012-12-04 09:30:24 +010014915 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014916 if (arg.ch == '%') {
14917 ctx->fmtpos++;
14918 ctx->fmtcnt--;
14919 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14920 return -1;
14921 return 0;
14922 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014923 arg.flags = 0;
14924 arg.width = -1;
14925 arg.prec = -1;
14926 arg.sign = 0;
14927 str = NULL;
14928
Victor Stinnera47082312012-10-04 02:19:54 +020014929 ret = unicode_format_arg_parse(ctx, &arg);
14930 if (ret == -1)
14931 return -1;
14932
14933 ret = unicode_format_arg_format(ctx, &arg, &str);
14934 if (ret == -1)
14935 return -1;
14936
14937 if (ret != 1) {
14938 ret = unicode_format_arg_output(ctx, &arg, str);
14939 Py_DECREF(str);
14940 if (ret == -1)
14941 return -1;
14942 }
14943
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014944 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014945 PyErr_SetString(PyExc_TypeError,
14946 "not all arguments converted during string formatting");
14947 return -1;
14948 }
14949 return 0;
14950}
14951
Alexander Belopolsky40018472011-02-26 01:02:56 +000014952PyObject *
14953PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014954{
Victor Stinnera47082312012-10-04 02:19:54 +020014955 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014956
Guido van Rossumd57fd912000-03-10 22:53:23 +000014957 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014958 PyErr_BadInternalCall();
14959 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014960 }
Victor Stinnera47082312012-10-04 02:19:54 +020014961
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014962 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014963 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014964
14965 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014966 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14967 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14968 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14969 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014970
Victor Stinner8f674cc2013-04-17 23:02:17 +020014971 _PyUnicodeWriter_Init(&ctx.writer);
14972 ctx.writer.min_length = ctx.fmtcnt + 100;
14973 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014974
Guido van Rossumd57fd912000-03-10 22:53:23 +000014975 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014976 ctx.arglen = PyTuple_Size(args);
14977 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014978 }
14979 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014980 ctx.arglen = -1;
14981 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014982 }
Victor Stinnera47082312012-10-04 02:19:54 +020014983 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014984 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014985 ctx.dict = args;
14986 else
14987 ctx.dict = NULL;
14988 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014989
Victor Stinnera47082312012-10-04 02:19:54 +020014990 while (--ctx.fmtcnt >= 0) {
14991 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014992 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014993
14994 nonfmtpos = ctx.fmtpos++;
14995 while (ctx.fmtcnt >= 0 &&
14996 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14997 ctx.fmtpos++;
14998 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 }
Victor Stinnera47082312012-10-04 02:19:54 +020015000 if (ctx.fmtcnt < 0) {
15001 ctx.fmtpos--;
15002 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015003 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015004
Victor Stinnercfc4c132013-04-03 01:48:39 +020015005 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15006 nonfmtpos, ctx.fmtpos) < 0)
15007 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015008 }
15009 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015010 ctx.fmtpos++;
15011 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015012 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015013 }
15014 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015015
Victor Stinnera47082312012-10-04 02:19:54 +020015016 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015017 PyErr_SetString(PyExc_TypeError,
15018 "not all arguments converted during string formatting");
15019 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015020 }
15021
Victor Stinnera47082312012-10-04 02:19:54 +020015022 if (ctx.args_owned) {
15023 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015024 }
Victor Stinnera47082312012-10-04 02:19:54 +020015025 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015026
Benjamin Peterson29060642009-01-31 22:14:21 +000015027 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015028 _PyUnicodeWriter_Dealloc(&ctx.writer);
15029 if (ctx.args_owned) {
15030 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015031 }
15032 return NULL;
15033}
15034
Jeremy Hylton938ace62002-07-17 16:30:39 +000015035static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015036unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15037
Tim Peters6d6c1a32001-08-02 04:15:00 +000015038static PyObject *
15039unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15040{
Benjamin Peterson29060642009-01-31 22:14:21 +000015041 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 static char *kwlist[] = {"object", "encoding", "errors", 0};
15043 char *encoding = NULL;
15044 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015045
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 if (type != &PyUnicode_Type)
15047 return unicode_subtype_new(type, args, kwds);
15048 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015049 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015050 return NULL;
15051 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015052 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015053 if (encoding == NULL && errors == NULL)
15054 return PyObject_Str(x);
15055 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015056 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015057}
15058
Guido van Rossume023fe02001-08-30 03:12:59 +000015059static PyObject *
15060unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15061{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015062 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015063 Py_ssize_t length, char_size;
15064 int share_wstr, share_utf8;
15065 unsigned int kind;
15066 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015067
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015069
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015070 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015071 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015072 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015073 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015074 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015075 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015076 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015077 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015078
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015079 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015080 if (self == NULL) {
15081 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 return NULL;
15083 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015084 kind = PyUnicode_KIND(unicode);
15085 length = PyUnicode_GET_LENGTH(unicode);
15086
15087 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015088#ifdef Py_DEBUG
15089 _PyUnicode_HASH(self) = -1;
15090#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015091 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015092#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015093 _PyUnicode_STATE(self).interned = 0;
15094 _PyUnicode_STATE(self).kind = kind;
15095 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015096 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015097 _PyUnicode_STATE(self).ready = 1;
15098 _PyUnicode_WSTR(self) = NULL;
15099 _PyUnicode_UTF8_LENGTH(self) = 0;
15100 _PyUnicode_UTF8(self) = NULL;
15101 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015102 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015103
15104 share_utf8 = 0;
15105 share_wstr = 0;
15106 if (kind == PyUnicode_1BYTE_KIND) {
15107 char_size = 1;
15108 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15109 share_utf8 = 1;
15110 }
15111 else if (kind == PyUnicode_2BYTE_KIND) {
15112 char_size = 2;
15113 if (sizeof(wchar_t) == 2)
15114 share_wstr = 1;
15115 }
15116 else {
15117 assert(kind == PyUnicode_4BYTE_KIND);
15118 char_size = 4;
15119 if (sizeof(wchar_t) == 4)
15120 share_wstr = 1;
15121 }
15122
15123 /* Ensure we won't overflow the length. */
15124 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15125 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015126 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015128 data = PyObject_MALLOC((length + 1) * char_size);
15129 if (data == NULL) {
15130 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015131 goto onError;
15132 }
15133
Victor Stinnerc3c74152011-10-02 20:39:55 +020015134 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015135 if (share_utf8) {
15136 _PyUnicode_UTF8_LENGTH(self) = length;
15137 _PyUnicode_UTF8(self) = data;
15138 }
15139 if (share_wstr) {
15140 _PyUnicode_WSTR_LENGTH(self) = length;
15141 _PyUnicode_WSTR(self) = (wchar_t *)data;
15142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015143
Christian Heimesf051e432016-09-13 20:22:02 +020015144 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015145 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015146 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015147#ifdef Py_DEBUG
15148 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15149#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015150 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015151 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015152
15153onError:
15154 Py_DECREF(unicode);
15155 Py_DECREF(self);
15156 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015157}
15158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015159PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015160"str(object='') -> str\n\
15161str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015162\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015163Create a new string object from the given object. If encoding or\n\
15164errors is specified, then the object must expose a data buffer\n\
15165that will be decoded using the given encoding and error handler.\n\
15166Otherwise, returns the result of object.__str__() (if defined)\n\
15167or repr(object).\n\
15168encoding defaults to sys.getdefaultencoding().\n\
15169errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015170
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015171static PyObject *unicode_iter(PyObject *seq);
15172
Guido van Rossumd57fd912000-03-10 22:53:23 +000015173PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015174 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015175 "str", /* tp_name */
15176 sizeof(PyUnicodeObject), /* tp_size */
15177 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015178 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015179 (destructor)unicode_dealloc, /* tp_dealloc */
15180 0, /* tp_print */
15181 0, /* tp_getattr */
15182 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015183 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015184 unicode_repr, /* tp_repr */
15185 &unicode_as_number, /* tp_as_number */
15186 &unicode_as_sequence, /* tp_as_sequence */
15187 &unicode_as_mapping, /* tp_as_mapping */
15188 (hashfunc) unicode_hash, /* tp_hash*/
15189 0, /* tp_call*/
15190 (reprfunc) unicode_str, /* tp_str */
15191 PyObject_GenericGetAttr, /* tp_getattro */
15192 0, /* tp_setattro */
15193 0, /* tp_as_buffer */
15194 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015195 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 unicode_doc, /* tp_doc */
15197 0, /* tp_traverse */
15198 0, /* tp_clear */
15199 PyUnicode_RichCompare, /* tp_richcompare */
15200 0, /* tp_weaklistoffset */
15201 unicode_iter, /* tp_iter */
15202 0, /* tp_iternext */
15203 unicode_methods, /* tp_methods */
15204 0, /* tp_members */
15205 0, /* tp_getset */
15206 &PyBaseObject_Type, /* tp_base */
15207 0, /* tp_dict */
15208 0, /* tp_descr_get */
15209 0, /* tp_descr_set */
15210 0, /* tp_dictoffset */
15211 0, /* tp_init */
15212 0, /* tp_alloc */
15213 unicode_new, /* tp_new */
15214 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015215};
15216
15217/* Initialize the Unicode implementation */
15218
Victor Stinner3a50e702011-10-18 21:21:00 +020015219int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015220{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015221 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015222 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015223 0x000A, /* LINE FEED */
15224 0x000D, /* CARRIAGE RETURN */
15225 0x001C, /* FILE SEPARATOR */
15226 0x001D, /* GROUP SEPARATOR */
15227 0x001E, /* RECORD SEPARATOR */
15228 0x0085, /* NEXT LINE */
15229 0x2028, /* LINE SEPARATOR */
15230 0x2029, /* PARAGRAPH SEPARATOR */
15231 };
15232
Fred Drakee4315f52000-05-09 19:53:39 +000015233 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015234 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015235 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015236 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015237 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015238
Guido van Rossumcacfc072002-05-24 19:01:59 +000015239 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015240 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015241
15242 /* initialize the linebreak bloom filter */
15243 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015244 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015245 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015246
Christian Heimes26532f72013-07-20 14:57:16 +020015247 if (PyType_Ready(&EncodingMapType) < 0)
15248 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015249
Benjamin Petersonc4311282012-10-30 23:21:10 -040015250 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15251 Py_FatalError("Can't initialize field name iterator type");
15252
15253 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15254 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015255
Victor Stinner3a50e702011-10-18 21:21:00 +020015256 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015257}
15258
15259/* Finalize the Unicode implementation */
15260
/* Releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15266
Guido van Rossumd57fd912000-03-10 22:53:23 +000015267void
Thomas Wouters78890102000-07-22 19:25:51 +000015268_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015269{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015270 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015271
Serhiy Storchaka05997252013-01-26 12:14:02 +020015272 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015273
Serhiy Storchaka05997252013-01-26 12:14:02 +020015274 for (i = 0; i < 256; i++)
15275 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015276 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015277 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015278}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015279
Walter Dörwald16807132007-05-25 13:52:07 +000015280void
15281PyUnicode_InternInPlace(PyObject **p)
15282{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015283 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015285#ifdef Py_DEBUG
15286 assert(s != NULL);
15287 assert(_PyUnicode_CHECK(s));
15288#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015290 return;
15291#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 /* If it's a subclass, we don't really know what putting
15293 it in the interned dict might do. */
15294 if (!PyUnicode_CheckExact(s))
15295 return;
15296 if (PyUnicode_CHECK_INTERNED(s))
15297 return;
15298 if (interned == NULL) {
15299 interned = PyDict_New();
15300 if (interned == NULL) {
15301 PyErr_Clear(); /* Don't leave an exception */
15302 return;
15303 }
15304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015305 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015306 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015308 if (t == NULL) {
15309 PyErr_Clear();
15310 return;
15311 }
15312 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015313 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015314 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015315 return;
15316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015317 /* The two references in interned are not counted by refcnt.
15318 The deallocator will take care of this */
15319 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015320 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015321}
15322
15323void
15324PyUnicode_InternImmortal(PyObject **p)
15325{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 PyUnicode_InternInPlace(p);
15327 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015328 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 Py_INCREF(*p);
15330 }
Walter Dörwald16807132007-05-25 13:52:07 +000015331}
15332
15333PyObject *
15334PyUnicode_InternFromString(const char *cp)
15335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 PyObject *s = PyUnicode_FromString(cp);
15337 if (s == NULL)
15338 return NULL;
15339 PyUnicode_InternInPlace(&s);
15340 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015341}
15342
Alexander Belopolsky40018472011-02-26 01:02:56 +000015343void
15344_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015345{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015347 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 Py_ssize_t i, n;
15349 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015350
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 if (interned == NULL || !PyDict_Check(interned))
15352 return;
15353 keys = PyDict_Keys(interned);
15354 if (keys == NULL || !PyList_Check(keys)) {
15355 PyErr_Clear();
15356 return;
15357 }
Walter Dörwald16807132007-05-25 13:52:07 +000015358
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15360 detector, interned unicode strings are not forcibly deallocated;
15361 rather, we give them their stolen references back, and then clear
15362 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015363
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 n = PyList_GET_SIZE(keys);
15365 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015366 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015368 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015369 if (PyUnicode_READY(s) == -1) {
15370 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015371 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015373 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 case SSTATE_NOT_INTERNED:
15375 /* XXX Shouldn't happen */
15376 break;
15377 case SSTATE_INTERNED_IMMORTAL:
15378 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015379 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015380 break;
15381 case SSTATE_INTERNED_MORTAL:
15382 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015383 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015384 break;
15385 default:
15386 Py_FatalError("Inconsistent interned string state.");
15387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015388 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015389 }
15390 fprintf(stderr, "total size of all interned strings: "
15391 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15392 "mortal/immortal\n", mortal_size, immortal_size);
15393 Py_DECREF(keys);
15394 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015395 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015396}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015397
15398
15399/********************* Unicode Iterator **************************/
15400
15401typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 PyObject_HEAD
15403 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015404 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015405} unicodeiterobject;
15406
15407static void
15408unicodeiter_dealloc(unicodeiterobject *it)
15409{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015410 _PyObject_GC_UNTRACK(it);
15411 Py_XDECREF(it->it_seq);
15412 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015413}
15414
15415static int
15416unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15417{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015418 Py_VISIT(it->it_seq);
15419 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015420}
15421
15422static PyObject *
15423unicodeiter_next(unicodeiterobject *it)
15424{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015425 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015426
Benjamin Peterson14339b62009-01-31 16:36:08 +000015427 assert(it != NULL);
15428 seq = it->it_seq;
15429 if (seq == NULL)
15430 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015431 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015433 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15434 int kind = PyUnicode_KIND(seq);
15435 void *data = PyUnicode_DATA(seq);
15436 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15437 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015438 if (item != NULL)
15439 ++it->it_index;
15440 return item;
15441 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015442
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015444 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015445 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015446}
15447
15448static PyObject *
15449unicodeiter_len(unicodeiterobject *it)
15450{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015451 Py_ssize_t len = 0;
15452 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015453 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015454 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015455}
15456
15457PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15458
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015459static PyObject *
15460unicodeiter_reduce(unicodeiterobject *it)
15461{
15462 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015463 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015464 it->it_seq, it->it_index);
15465 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015466 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015467 if (u == NULL)
15468 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015469 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015470 }
15471}
15472
15473PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15474
15475static PyObject *
15476unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15477{
15478 Py_ssize_t index = PyLong_AsSsize_t(state);
15479 if (index == -1 && PyErr_Occurred())
15480 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015481 if (it->it_seq != NULL) {
15482 if (index < 0)
15483 index = 0;
15484 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15485 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15486 it->it_index = index;
15487 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015488 Py_RETURN_NONE;
15489}
15490
15491PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15492
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015493static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015495 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015496 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15497 reduce_doc},
15498 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15499 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015500 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015501};
15502
15503PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15505 "str_iterator", /* tp_name */
15506 sizeof(unicodeiterobject), /* tp_basicsize */
15507 0, /* tp_itemsize */
15508 /* methods */
15509 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15510 0, /* tp_print */
15511 0, /* tp_getattr */
15512 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015513 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015514 0, /* tp_repr */
15515 0, /* tp_as_number */
15516 0, /* tp_as_sequence */
15517 0, /* tp_as_mapping */
15518 0, /* tp_hash */
15519 0, /* tp_call */
15520 0, /* tp_str */
15521 PyObject_GenericGetAttr, /* tp_getattro */
15522 0, /* tp_setattro */
15523 0, /* tp_as_buffer */
15524 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15525 0, /* tp_doc */
15526 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15527 0, /* tp_clear */
15528 0, /* tp_richcompare */
15529 0, /* tp_weaklistoffset */
15530 PyObject_SelfIter, /* tp_iter */
15531 (iternextfunc)unicodeiter_next, /* tp_iternext */
15532 unicodeiter_methods, /* tp_methods */
15533 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015534};
15535
15536static PyObject *
15537unicode_iter(PyObject *seq)
15538{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015539 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015540
Benjamin Peterson14339b62009-01-31 16:36:08 +000015541 if (!PyUnicode_Check(seq)) {
15542 PyErr_BadInternalCall();
15543 return NULL;
15544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015545 if (PyUnicode_READY(seq) == -1)
15546 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015547 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15548 if (it == NULL)
15549 return NULL;
15550 it->it_index = 0;
15551 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015552 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015553 _PyObject_GC_TRACK(it);
15554 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015555}
15556
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015557
/* Number of characters before the terminating null of u. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);

    return length;
}
15563
/* Copy s2, including its terminating null, into s1 and return s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    return wcscpy(s1, s2);
}
15571
/* Copy at most n characters from s2 into s1 and return s1.

   Copying stops after the terminating null of s2 has been stored or after
   n elements have been written, whichever comes first; nothing is written
   when n is 0.  Unlike C strncpy(), the rest of s1 is not zero-filled, and
   s1 is left unterminated when s2 holds n or more characters.

   The budget is checked before every store, so the function never writes
   past s1[n - 1]. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15581
/* Append s2, including its terminating null, to the end of s1 and
   return s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    return wcscat(s1, s2);
}
15590
/* Compare two null-terminated strings.

   Returns -1, 0 or +1.  A string that is a proper prefix of the other one
   compares as smaller. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s2 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0) {
        /* s1 ended: equal if s2 ended as well, otherwise s1 is a prefix. */
        return (*s2 == 0) ? 0 : -1;
    }
    if (*s2 == 0) {
        return 1;
    }
    return (*s1 < *s2) ? -1 : +1;
}
15604
/* Compare at most n characters of two null-terminated strings.

   Returns -1 or +1 at the first differing position, and 0 when the strings
   agree up to their terminating null or over the first n characters. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE c1 = s1[i];
        Py_UNICODE c2 = s2[i];

        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
        if (c1 == '\0') {
            return 0;
        }
    }
    return 0;
}
15621
/* Return a pointer to the first occurrence of c in s, or NULL when there
   is none.  The terminating null is not searched, so c == 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15631
/* Return a pointer to the last occurrence of c in s, or NULL when there
   is none.  The terminating null is not searched, so c == 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    /* Walk backwards from the last character to the first. */
    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015644
Victor Stinner71133ff2010-09-01 23:43:53 +000015645Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015646PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015647{
Victor Stinner577db2c2011-10-11 22:12:48 +020015648 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015649 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015651 if (!PyUnicode_Check(unicode)) {
15652 PyErr_BadArgument();
15653 return NULL;
15654 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015655 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015656 if (u == NULL)
15657 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015658 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015659 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015660 PyErr_NoMemory();
15661 return NULL;
15662 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015663 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015664 size *= sizeof(Py_UNICODE);
15665 copy = PyMem_Malloc(size);
15666 if (copy == NULL) {
15667 PyErr_NoMemory();
15668 return NULL;
15669 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015670 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015671 return copy;
15672}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015673
Georg Brandl66c221e2010-10-14 07:04:07 +000015674/* A _string module, to export formatter_parser and formatter_field_name_split
15675 to the string.Formatter class implemented in Python. */
15676
15677static PyMethodDef _string_methods[] = {
15678 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15679 METH_O, PyDoc_STR("split the argument as a field name")},
15680 {"formatter_parser", (PyCFunction) formatter_parser,
15681 METH_O, PyDoc_STR("parse the argument as a format string")},
15682 {NULL, NULL}
15683};
15684
15685static struct PyModuleDef _string_module = {
15686 PyModuleDef_HEAD_INIT,
15687 "_string",
15688 PyDoc_STR("string helper module"),
15689 0,
15690 _string_methods,
15691 NULL,
15692 NULL,
15693 NULL,
15694 NULL
15695};
15696
15697PyMODINIT_FUNC
15698PyInit__string(void)
15699{
15700 return PyModule_Create(&_string_module);
15701}
15702
15703
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015704#ifdef __cplusplus
15705}
15706#endif