blob: d46ab2a1e2ab8d24a0c39488d69729ba6165792c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Larry Hastings61272b72014-01-07 12:41:53 -080052/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090053class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080054[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090055/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57/*[python input]
58class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68[python start generated code]*/
69/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000071/* --- Globals ------------------------------------------------------------
72
Serhiy Storchaka05997252013-01-26 12:14:02 +020073NOTE: In the interpreter's initialization phase, some globals are currently
74 initialized dynamically as needed. In the process Unicode objects may
75 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000076
77*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
Victor Stinner8faf8212011-12-08 22:14:11 +010084/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
85#define MAX_UNICODE 0x10ffff
86
/* Validity check used by the accessor macros in this file.  A Py_DEBUG
   build runs _PyUnicode_CheckConsistency() with check_content == 0 (no scan
   of the characters); any other build only runs PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors.  Unless noted otherwise they cast op to the relevant
   struct type and dereference it without any checking, and they expand to
   lvalues so they can also be assigned to. */

/* utf8 pointer stored in the PyCompactUnicodeObject part of op. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready unicode object: for a compact ASCII object this
   is the memory directly following the PyASCIIObject struct, otherwise the
   stored utf8 pointer.  Asserts that op is valid and ready. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length field stored in the PyCompactUnicodeObject part of op. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 buffer of a ready unicode object: the `length` field
   for a compact ASCII object, otherwise the stored utf8_length.  Asserts
   that op is valid and ready. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr pointer (PyASCIIObject part) and wstr_length
   (PyCompactUnicodeObject part) of op. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash fields of the PyASCIIObject part of op. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind of op; asserts _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length field of op; asserts _PyUnicode_CHECK(op) first. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any pointer of the PyUnicodeObject part of op. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local redefinition of PyUnicode_READY(): after asserting
   _PyUnicode_CHECK(op), it evaluates to 0 when op is already ready and
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the utf8 pointer of op is the same address as the object's
   character data.  Asserts that op is valid and is not a compact ASCII
   object. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as the object's
   character data.  Asserts _PyUnicode_CHECK(op) first.  Every access goes
   through the macro parameter `op`, so the macro does not depend on the
   name of any variable at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   object is not ready yet or wstr differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast to to_type.  The first
   loop copies four characters per iteration, up to the character count
   rounded down to a multiple of 4; the second loop copies whatever is
   left.  The destination must have room for (end - begin) characters; the
   macro performs no bounds checking itself. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200181#ifdef MS_WINDOWS
182 /* On Windows, overallocate by 50% is the best factor */
183# define OVERALLOCATE_FACTOR 2
184#else
185 /* On Linux, overallocate by 25% is the best factor */
186# define OVERALLOCATE_FACTOR 4
187#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
/* Take an additional reference to the shared empty string.

   If unicode_empty is still NULL it is first created with
   PyUnicode_New(0, 0); on success the new object is INCREF'ed once more (so
   the global keeps its own reference) and, in builds with assertions, its
   consistency is checked.  If the creation fails, unicode_empty stays NULL
   and no reference is taken, so callers must test unicode_empty for NULL
   afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
/* Return the shared empty string from the enclosing function after taking a
   reference to it with _Py_INCREF_UNICODE_EMPTY().  The returned value is
   whatever unicode_empty holds after that call, i.e. NULL if the empty
   string could not be created.  The macro contains a `return` statement, so
   no code placed after it in the same branch is executed. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND); any
   other kind reaches Py_UNREACHABLE().

   Assertions check that start is not negative, that kind is not
   PyUnicode_WCHAR_KIND and that value fits the selected width (0xff,
   0xffff or MAX_UNICODE).  The end of the buffer is not checked.

   The arguments are pasted into the expansion without parentheses and
   `kind` and `start` are evaluated more than once, so pass simple,
   side-effect-free expressions. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert(0 <= start); \
        assert(kind != PyUnicode_WCHAR_KIND); \
        switch (kind) { \
        case PyUnicode_1BYTE_KIND: { \
            assert(value <= 0xff); \
            Py_UCS1 ch = (unsigned char)value; \
            Py_UCS1 *to = (Py_UCS1 *)data + start; \
            memset(to, ch, length); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            assert(value <= 0xffff); \
            Py_UCS2 ch = (Py_UCS2)value; \
            Py_UCS2 *to = (Py_UCS2 *)data + start; \
            const Py_UCS2 *end = to + length; \
            for (; to < end; ++to) *to = ch; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            assert(value <= MAX_UNICODE); \
            Py_UCS4 ch = value; \
            Py_UCS4 * to = (Py_UCS4 *)data + start; \
            const Py_UCS4 *end = to + length; \
            for (; to < end; ++to) *to = ch; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
252
253
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200254/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700255static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200256_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
257
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200258/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200259static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200260
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261/* Single character Unicode strings in the Latin-1 range are being
262 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200263static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000264
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0..127: an entry is 1 for the
   characters listed below and 0 for every other index (elements without an
   explicit initializer are zero). */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
295
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200296/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200297static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200298static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100299static int unicode_modifiable(PyObject *unicode);
300
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301
Alexander Belopolsky40018472011-02-26 01:02:56 +0000302static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100303_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200304static PyObject *
305_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
306static PyObject *
307_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
308
309static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000311 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100312 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000313 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
314
Alexander Belopolsky40018472011-02-26 01:02:56 +0000315static void
316raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300317 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100318 PyObject *unicode,
319 Py_ssize_t startpos, Py_ssize_t endpos,
320 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000321
/* Fast detection of line break characters.

   Lookup table indexed by code point 0..127: an entry is 1 for the
   characters listed below and 0 for every other index (elements without an
   explicit initializer are zero). */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
349
INADA Naoki3ae20562017-01-16 20:41:20 +0900350static int convert_uc(PyObject *obj, void *addr);
351
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300352#include "clinic/unicodeobject.c.h"
353
/* Codes for the error handler names that get_error_handler() recognises.
   _Py_ERROR_UNKNOWN is 0; _Py_ERROR_OTHER stands for any name that is not
   in the list. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Translate an error handler name into its _Py_error_handler code.

   errors -- NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL and for "strict", the matching code for
   the other known names, and _Py_ERROR_OTHER for everything else.  The
   comparison is exact (strcmp), so it is case-sensitive. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler code;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    const size_t count = sizeof(known_handlers) / sizeof(known_handlers[0]);
    size_t idx;

    /* A missing handler name means "strict". */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }

    for (idx = 0; idx < count; idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].code;
        }
    }
    return _Py_ERROR_OTHER;
}
392
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300393/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
394 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000395Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000396PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000397{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000398#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000399 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000400#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000401 /* This is actually an illegal character, so it should
402 not be passed to unichr. */
403 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000404#endif
405}
406
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a unicode object.

   Every invariant is verified with assert(); when none of them fails the
   function returns 1, which allows callers to wrap the call in assert()
   themselves.

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- if non-zero (and the kind is not PyUnicode_WCHAR_KIND),
                    additionally scan the characters to verify that the
                    narrowest suitable kind is used and that the data is
                    followed by a zero character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII object: must be a ready 1-byte string. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Compact, non-ASCII object: the characters are located right
           after the PyCompactUnicodeObject struct. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Non-compact object: the characters are referenced through
               the data.any pointer. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer may be set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* An ASCII string uses its character data as UTF-8
                   buffer; any other string must not. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr is expected to point at the character data exactly when
           the kind has the same width as wchar_t. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing buffer must come with a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* Find the largest code point stored in the string. */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall into the range that requires
           exactly this kind (and this ascii flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character following the last one must be zero. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
522
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100523static PyObject*
524unicode_result_wchar(PyObject *unicode)
525{
526#ifndef Py_DEBUG
527 Py_ssize_t len;
528
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 len = _PyUnicode_WSTR_LENGTH(unicode);
530 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100531 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200532 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100533 }
534
535 if (len == 1) {
536 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100537 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100538 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
539 Py_DECREF(unicode);
540 return latin1_char;
541 }
542 }
543
544 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200545 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100546 return NULL;
547 }
548#else
Victor Stinneraa771272012-10-04 02:32:58 +0200549 assert(Py_REFCNT(unicode) == 1);
550
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100551 /* don't make the result ready in debug mode to ensure that the caller
552 makes the string ready before using it */
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554#endif
555 return unicode;
556}
557
558static PyObject*
559unicode_result_ready(PyObject *unicode)
560{
561 Py_ssize_t length;
562
563 length = PyUnicode_GET_LENGTH(unicode);
564 if (length == 0) {
565 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100566 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200567 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100568 }
569 return unicode_empty;
570 }
571
572 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200573 void *data = PyUnicode_DATA(unicode);
574 int kind = PyUnicode_KIND(unicode);
575 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100576 if (ch < 256) {
577 PyObject *latin1_char = unicode_latin1[ch];
578 if (latin1_char != NULL) {
579 if (unicode != latin1_char) {
580 Py_INCREF(latin1_char);
581 Py_DECREF(unicode);
582 }
583 return latin1_char;
584 }
585 else {
586 assert(_PyUnicode_CheckConsistency(unicode, 1));
587 Py_INCREF(unicode);
588 unicode_latin1[ch] = unicode;
589 return unicode;
590 }
591 }
592 }
593
594 assert(_PyUnicode_CheckConsistency(unicode, 1));
595 return unicode;
596}
597
598static PyObject*
599unicode_result(PyObject *unicode)
600{
601 assert(_PyUnicode_CHECK(unicode));
602 if (PyUnicode_IS_READY(unicode))
603 return unicode_result_ready(unicode);
604 else
605 return unicode_result_wchar(unicode);
606}
607
Victor Stinnerc4b49542011-12-11 22:44:26 +0100608static PyObject*
609unicode_result_unchanged(PyObject *unicode)
610{
611 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500612 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100613 return NULL;
614 Py_INCREF(unicode);
615 return unicode;
616 }
617 else
618 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100619 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100620}
621
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
623 ASCII, Latin1, UTF-8, etc. */
624static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200625backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200626 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
627{
Victor Stinnerad771582015-10-09 12:38:53 +0200628 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200629 Py_UCS4 ch;
630 enum PyUnicode_Kind kind;
631 void *data;
632
633 assert(PyUnicode_IS_READY(unicode));
634 kind = PyUnicode_KIND(unicode);
635 data = PyUnicode_DATA(unicode);
636
637 size = 0;
638 /* determine replacement size */
639 for (i = collstart; i < collend; ++i) {
640 Py_ssize_t incr;
641
642 ch = PyUnicode_READ(kind, data, i);
643 if (ch < 0x100)
644 incr = 2+2;
645 else if (ch < 0x10000)
646 incr = 2+4;
647 else {
648 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200649 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200650 }
651 if (size > PY_SSIZE_T_MAX - incr) {
652 PyErr_SetString(PyExc_OverflowError,
653 "encoded result is too long for a Python string");
654 return NULL;
655 }
656 size += incr;
657 }
658
Victor Stinnerad771582015-10-09 12:38:53 +0200659 str = _PyBytesWriter_Prepare(writer, str, size);
660 if (str == NULL)
661 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662
663 /* generate replacement */
664 for (i = collstart; i < collend; ++i) {
665 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200666 *str++ = '\\';
667 if (ch >= 0x00010000) {
668 *str++ = 'U';
669 *str++ = Py_hexdigits[(ch>>28)&0xf];
670 *str++ = Py_hexdigits[(ch>>24)&0xf];
671 *str++ = Py_hexdigits[(ch>>20)&0xf];
672 *str++ = Py_hexdigits[(ch>>16)&0xf];
673 *str++ = Py_hexdigits[(ch>>12)&0xf];
674 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200675 }
Victor Stinner797485e2015-10-09 03:17:30 +0200676 else if (ch >= 0x100) {
677 *str++ = 'u';
678 *str++ = Py_hexdigits[(ch>>12)&0xf];
679 *str++ = Py_hexdigits[(ch>>8)&0xf];
680 }
681 else
682 *str++ = 'x';
683 *str++ = Py_hexdigits[(ch>>4)&0xf];
684 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200685 }
686 return str;
687}
688
689/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
690 ASCII, Latin1, UTF-8, etc. */
691static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200692xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200693 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
694{
Victor Stinnerad771582015-10-09 12:38:53 +0200695 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696 Py_UCS4 ch;
697 enum PyUnicode_Kind kind;
698 void *data;
699
700 assert(PyUnicode_IS_READY(unicode));
701 kind = PyUnicode_KIND(unicode);
702 data = PyUnicode_DATA(unicode);
703
704 size = 0;
705 /* determine replacement size */
706 for (i = collstart; i < collend; ++i) {
707 Py_ssize_t incr;
708
709 ch = PyUnicode_READ(kind, data, i);
710 if (ch < 10)
711 incr = 2+1+1;
712 else if (ch < 100)
713 incr = 2+2+1;
714 else if (ch < 1000)
715 incr = 2+3+1;
716 else if (ch < 10000)
717 incr = 2+4+1;
718 else if (ch < 100000)
719 incr = 2+5+1;
720 else if (ch < 1000000)
721 incr = 2+6+1;
722 else {
723 assert(ch <= MAX_UNICODE);
724 incr = 2+7+1;
725 }
726 if (size > PY_SSIZE_T_MAX - incr) {
727 PyErr_SetString(PyExc_OverflowError,
728 "encoded result is too long for a Python string");
729 return NULL;
730 }
731 size += incr;
732 }
733
Victor Stinnerad771582015-10-09 12:38:53 +0200734 str = _PyBytesWriter_Prepare(writer, str, size);
735 if (str == NULL)
736 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200737
738 /* generate replacement */
739 for (i = collstart; i < collend; ++i) {
740 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
741 }
742 return str;
743}
744
Thomas Wouters477c8d52006-05-27 19:21:47 +0000745/* --- Bloom Filters ----------------------------------------------------- */
746
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
750
751/* the linebreak mask is set up by Unicode_Init below */
752
Antoine Pitrouf068f942010-01-13 14:19:12 +0000753#if LONG_BIT >= 128
754#define BLOOM_WIDTH 128
755#elif LONG_BIT >= 64
756#define BLOOM_WIDTH 64
757#elif LONG_BIT >= 32
758#define BLOOM_WIDTH 32
759#else
760#error "LONG_BIT is smaller than 32"
761#endif
762
Thomas Wouters477c8d52006-05-27 19:21:47 +0000763#define BLOOM_MASK unsigned long
764
Serhiy Storchaka05997252013-01-26 12:14:02 +0200765static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000766
Antoine Pitrouf068f942010-01-13 14:19:12 +0000767#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson29060642009-01-31 22:14:21 +0000769#define BLOOM_LINEBREAK(ch) \
770 ((ch) < 128U ? ascii_linebreak[(ch)] : \
771 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700773static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775{
Victor Stinnera85af502013-04-09 21:53:54 +0200776#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
777 do { \
778 TYPE *data = (TYPE *)PTR; \
779 TYPE *end = data + LEN; \
780 Py_UCS4 ch; \
781 for (; data != end; data++) { \
782 ch = *data; \
783 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
784 } \
785 break; \
786 } while (0)
787
Thomas Wouters477c8d52006-05-27 19:21:47 +0000788 /* calculate simple bloom-style bitmask for a given unicode string */
789
Antoine Pitrouf068f942010-01-13 14:19:12 +0000790 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000791
792 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200793 switch (kind) {
794 case PyUnicode_1BYTE_KIND:
795 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
796 break;
797 case PyUnicode_2BYTE_KIND:
798 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
799 break;
800 case PyUnicode_4BYTE_KIND:
801 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
802 break;
803 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700804 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200805 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200807
808#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809}
810
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300811static int
812ensure_unicode(PyObject *obj)
813{
814 if (!PyUnicode_Check(obj)) {
815 PyErr_Format(PyExc_TypeError,
816 "must be str, not %.100s",
817 Py_TYPE(obj)->tp_name);
818 return -1;
819 }
820 return PyUnicode_READY(obj);
821}
822
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200823/* Compilation of templated routines */
824
825#include "stringlib/asciilib.h"
826#include "stringlib/fastsearch.h"
827#include "stringlib/partition.h"
828#include "stringlib/split.h"
829#include "stringlib/count.h"
830#include "stringlib/find.h"
831#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200832#include "stringlib/undef.h"
833
834#include "stringlib/ucs1lib.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/partition.h"
837#include "stringlib/split.h"
838#include "stringlib/count.h"
839#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300840#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200841#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200842#include "stringlib/undef.h"
843
844#include "stringlib/ucs2lib.h"
845#include "stringlib/fastsearch.h"
846#include "stringlib/partition.h"
847#include "stringlib/split.h"
848#include "stringlib/count.h"
849#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300850#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200851#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200852#include "stringlib/undef.h"
853
854#include "stringlib/ucs4lib.h"
855#include "stringlib/fastsearch.h"
856#include "stringlib/partition.h"
857#include "stringlib/split.h"
858#include "stringlib/count.h"
859#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300860#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200861#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200862#include "stringlib/undef.h"
863
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200864#include "stringlib/unicodedefs.h"
865#include "stringlib/fastsearch.h"
866#include "stringlib/count.h"
867#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100868#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200869
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870/* --- Unicode Object ----------------------------------------------------- */
871
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700872static inline Py_ssize_t
873findchar(const void *s, int kind,
874 Py_ssize_t size, Py_UCS4 ch,
875 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200876{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200877 switch (kind) {
878 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200879 if ((Py_UCS1) ch != ch)
880 return -1;
881 if (direction > 0)
882 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
883 else
884 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200885 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200886 if ((Py_UCS2) ch != ch)
887 return -1;
888 if (direction > 0)
889 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
890 else
891 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200892 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200893 if (direction > 0)
894 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
895 else
896 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200897 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700898 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900}
901
Victor Stinnerafffce42012-10-03 23:03:17 +0200902#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000903/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200904 earlier.
905
906 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
907 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
908 invalid character in Unicode 6.0. */
909static void
910unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
911{
912 int kind = PyUnicode_KIND(unicode);
913 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
914 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
915 if (length <= old_length)
916 return;
917 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
918}
919#endif
920
Victor Stinnerfe226c02011-10-03 03:52:20 +0200921static PyObject*
922resize_compact(PyObject *unicode, Py_ssize_t length)
923{
924 Py_ssize_t char_size;
925 Py_ssize_t struct_size;
926 Py_ssize_t new_size;
927 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100928 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200929#ifdef Py_DEBUG
930 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
931#endif
932
Victor Stinner79891572012-05-03 13:43:07 +0200933 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200934 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100935 assert(PyUnicode_IS_COMPACT(unicode));
936
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200937 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100938 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 struct_size = sizeof(PyASCIIObject);
940 else
941 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
945 PyErr_NoMemory();
946 return NULL;
947 }
948 new_size = (struct_size + (length + 1) * char_size);
949
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200950 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
951 PyObject_DEL(_PyUnicode_UTF8(unicode));
952 _PyUnicode_UTF8(unicode) = NULL;
953 _PyUnicode_UTF8_LENGTH(unicode) = 0;
954 }
Victor Stinner84def372011-12-11 20:04:56 +0100955 _Py_DEC_REFTOTAL;
956 _Py_ForgetReference(unicode);
957
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300958 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100959 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100960 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 PyErr_NoMemory();
962 return NULL;
963 }
Victor Stinner84def372011-12-11 20:04:56 +0100964 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100966
Victor Stinnerfe226c02011-10-03 03:52:20 +0200967 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200968 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200969 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100970 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200971 _PyUnicode_WSTR_LENGTH(unicode) = length;
972 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100973 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
974 PyObject_DEL(_PyUnicode_WSTR(unicode));
975 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100976 if (!PyUnicode_IS_ASCII(unicode))
977 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100978 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200979#ifdef Py_DEBUG
980 unicode_fill_invalid(unicode, old_length);
981#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
983 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200984 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200985 return unicode;
986}
987
Alexander Belopolsky40018472011-02-26 01:02:56 +0000988static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200989resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990{
Victor Stinner95663112011-10-04 01:03:50 +0200991 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100992 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 if (PyUnicode_IS_READY(unicode)) {
997 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200998 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001000#ifdef Py_DEBUG
1001 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1002#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003
1004 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001005 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001006 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1007 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008
1009 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1010 PyErr_NoMemory();
1011 return -1;
1012 }
1013 new_size = (length + 1) * char_size;
1014
Victor Stinner7a9105a2011-12-12 00:13:42 +01001015 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1016 {
1017 PyObject_DEL(_PyUnicode_UTF8(unicode));
1018 _PyUnicode_UTF8(unicode) = NULL;
1019 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1020 }
1021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 data = (PyObject *)PyObject_REALLOC(data, new_size);
1023 if (data == NULL) {
1024 PyErr_NoMemory();
1025 return -1;
1026 }
1027 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001028 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_WSTR_LENGTH(unicode) = length;
1031 }
1032 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001033 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001034 _PyUnicode_UTF8_LENGTH(unicode) = length;
1035 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 _PyUnicode_LENGTH(unicode) = length;
1037 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 unicode_fill_invalid(unicode, old_length);
1040#endif
Victor Stinner95663112011-10-04 01:03:50 +02001041 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001042 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 }
Victor Stinner95663112011-10-04 01:03:50 +02001046 assert(_PyUnicode_WSTR(unicode) != NULL);
1047
1048 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001049 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001050 PyErr_NoMemory();
1051 return -1;
1052 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001053 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001054 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001055 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001056 if (!wstr) {
1057 PyErr_NoMemory();
1058 return -1;
1059 }
1060 _PyUnicode_WSTR(unicode) = wstr;
1061 _PyUnicode_WSTR(unicode)[length] = 0;
1062 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001063 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 return 0;
1065}
1066
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067static PyObject*
1068resize_copy(PyObject *unicode, Py_ssize_t length)
1069{
1070 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001071 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001073
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001074 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001075
1076 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1077 if (copy == NULL)
1078 return NULL;
1079
1080 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001083 }
1084 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001085 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001086
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001087 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 if (w == NULL)
1089 return NULL;
1090 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1091 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001092 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001093 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001094 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 }
1096}
1097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001099 Ux0000 terminated; some code (e.g. new_identifier)
1100 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001103 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104
1105*/
1106
Alexander Belopolsky40018472011-02-26 01:02:56 +00001107static PyUnicodeObject *
1108_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001110 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112
Thomas Wouters477c8d52006-05-27 19:21:47 +00001113 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 if (length == 0 && unicode_empty != NULL) {
1115 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001116 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 }
1118
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001119 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001120 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001121 return (PyUnicodeObject *)PyErr_NoMemory();
1122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 if (length < 0) {
1124 PyErr_SetString(PyExc_SystemError,
1125 "Negative size passed to _PyUnicode_New");
1126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 }
1128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1130 if (unicode == NULL)
1131 return NULL;
1132 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001133
1134 _PyUnicode_WSTR_LENGTH(unicode) = length;
1135 _PyUnicode_HASH(unicode) = -1;
1136 _PyUnicode_STATE(unicode).interned = 0;
1137 _PyUnicode_STATE(unicode).kind = 0;
1138 _PyUnicode_STATE(unicode).compact = 0;
1139 _PyUnicode_STATE(unicode).ready = 0;
1140 _PyUnicode_STATE(unicode).ascii = 0;
1141 _PyUnicode_DATA_ANY(unicode) = NULL;
1142 _PyUnicode_LENGTH(unicode) = 0;
1143 _PyUnicode_UTF8(unicode) = NULL;
1144 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1147 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001148 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001150 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152
Jeremy Hyltond8082792003-09-16 19:41:39 +00001153 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001154 * the caller fails before initializing str -- unicode_resize()
1155 * reads str[0], and the Keep-Alive optimization can keep memory
1156 * allocated for str alive across a call to unicode_dealloc(unicode).
1157 * We don't want unicode_resize to read uninitialized memory in
1158 * that case.
1159 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 _PyUnicode_WSTR(unicode)[0] = 0;
1161 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001162
Victor Stinner7931d9a2011-11-04 00:22:48 +01001163 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 return unicode;
1165}
1166
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167static const char*
1168unicode_kind_name(PyObject *unicode)
1169{
Victor Stinner42dfd712011-10-03 14:41:45 +02001170 /* don't check consistency: unicode_kind_name() is called from
1171 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 if (!PyUnicode_IS_COMPACT(unicode))
1173 {
1174 if (!PyUnicode_IS_READY(unicode))
1175 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001176 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001177 {
1178 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001179 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001180 return "legacy ascii";
1181 else
1182 return "legacy latin1";
1183 case PyUnicode_2BYTE_KIND:
1184 return "legacy UCS2";
1185 case PyUnicode_4BYTE_KIND:
1186 return "legacy UCS4";
1187 default:
1188 return "<legacy invalid kind>";
1189 }
1190 }
1191 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001192 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001193 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001194 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001195 return "ascii";
1196 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001199 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001200 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001202 default:
1203 return "<invalid compact kind>";
1204 }
1205}
1206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1212
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1216void *_PyUnicode_data(void *unicode){
1217 printf("obj %p\n", unicode);
1218 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223 return PyUnicode_DATA(unicode);
1224}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001225
1226void
1227_PyUnicode_Dump(PyObject *op)
1228{
1229 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001230 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001233
Victor Stinnera849a4b2011-10-03 12:12:11 +02001234 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001235 {
1236 if (ascii->state.ascii)
1237 data = (ascii + 1);
1238 else
1239 data = (compact + 1);
1240 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001241 else
1242 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001243 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001245
Victor Stinnera849a4b2011-10-03 12:12:11 +02001246 if (ascii->wstr == data)
1247 printf("shared ");
1248 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001249
Victor Stinnera3b334d2011-10-03 13:53:37 +02001250 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001251 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001252 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001254 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001256 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001257 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001258}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259#endif
1260
1261PyObject *
1262PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263{
1264 PyObject *obj;
1265 PyCompactUnicodeObject *unicode;
1266 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001267 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001268 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 Py_ssize_t char_size;
1270 Py_ssize_t struct_size;
1271
1272 /* Optimization for empty strings */
1273 if (size == 0 && unicode_empty != NULL) {
1274 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001275 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276 }
1277
Victor Stinner9e9d6892011-10-04 01:02:02 +02001278 is_ascii = 0;
1279 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280 struct_size = sizeof(PyCompactUnicodeObject);
1281 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001282 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 char_size = 1;
1284 is_ascii = 1;
1285 struct_size = sizeof(PyASCIIObject);
1286 }
1287 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001288 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 char_size = 1;
1290 }
1291 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001292 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 char_size = 2;
1294 if (sizeof(wchar_t) == 2)
1295 is_sharing = 1;
1296 }
1297 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001298 if (maxchar > MAX_UNICODE) {
1299 PyErr_SetString(PyExc_SystemError,
1300 "invalid maximum character passed to PyUnicode_New");
1301 return NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 char_size = 4;
1305 if (sizeof(wchar_t) == 4)
1306 is_sharing = 1;
1307 }
1308
1309 /* Ensure we won't overflow the size. */
1310 if (size < 0) {
1311 PyErr_SetString(PyExc_SystemError,
1312 "Negative size passed to PyUnicode_New");
1313 return NULL;
1314 }
1315 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316 return PyErr_NoMemory();
1317
1318 /* Duplicated allocation code from _PyObject_New() instead of a call to
1319 * PyObject_New() so we are able to allocate space for the object and
1320 * it's data buffer.
1321 */
1322 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323 if (obj == NULL)
1324 return PyErr_NoMemory();
1325 obj = PyObject_INIT(obj, &PyUnicode_Type);
1326 if (obj == NULL)
1327 return NULL;
1328
1329 unicode = (PyCompactUnicodeObject *)obj;
1330 if (is_ascii)
1331 data = ((PyASCIIObject*)obj) + 1;
1332 else
1333 data = unicode + 1;
1334 _PyUnicode_LENGTH(unicode) = size;
1335 _PyUnicode_HASH(unicode) = -1;
1336 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001337 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 _PyUnicode_STATE(unicode).compact = 1;
1339 _PyUnicode_STATE(unicode).ready = 1;
1340 _PyUnicode_STATE(unicode).ascii = is_ascii;
1341 if (is_ascii) {
1342 ((char*)data)[size] = 0;
1343 _PyUnicode_WSTR(unicode) = NULL;
1344 }
Victor Stinner8f825062012-04-27 13:55:39 +02001345 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 ((char*)data)[size] = 0;
1347 _PyUnicode_WSTR(unicode) = NULL;
1348 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001350 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352 else {
1353 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001354 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001355 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001357 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 ((Py_UCS4*)data)[size] = 0;
1359 if (is_sharing) {
1360 _PyUnicode_WSTR_LENGTH(unicode) = size;
1361 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362 }
1363 else {
1364 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365 _PyUnicode_WSTR(unicode) = NULL;
1366 }
1367 }
Victor Stinner8f825062012-04-27 13:55:39 +02001368#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001369 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001370#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001371 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 return obj;
1373}
1374
#if SIZEOF_WCHAR_T == 2
/* Decode a 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`, combining each valid high/low surrogate pair into a single
   code point.  Lone surrogates are copied through unchanged.

   The caller must have sized `unicode` (a 4-byte kind string) so that its
   length equals the number of decoded code points; this is only checked by
   assertions. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid pair: two input units produce one output code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1414
Victor Stinnercd9950f2011-10-02 00:34:53 +02001415static int
Victor Stinner488fa492011-12-12 00:01:39 +01001416unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001417{
Victor Stinner488fa492011-12-12 00:01:39 +01001418 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001419 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001420 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001421 return -1;
1422 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001423 return 0;
1424}
1425
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001426static int
1427_copy_characters(PyObject *to, Py_ssize_t to_start,
1428 PyObject *from, Py_ssize_t from_start,
1429 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001431 unsigned int from_kind, to_kind;
1432 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433
Victor Stinneree4544c2012-05-09 22:24:08 +02001434 assert(0 <= how_many);
1435 assert(0 <= from_start);
1436 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001438 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001439 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440
Victor Stinnerd3f08822012-05-29 12:57:52 +02001441 assert(PyUnicode_Check(to));
1442 assert(PyUnicode_IS_READY(to));
1443 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 if (how_many == 0)
1446 return 0;
1447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001449 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001451 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
Victor Stinnerf1852262012-06-16 16:38:26 +02001453#ifdef Py_DEBUG
1454 if (!check_maxchar
1455 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456 {
1457 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458 Py_UCS4 ch;
1459 Py_ssize_t i;
1460 for (i=0; i < how_many; i++) {
1461 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462 assert(ch <= to_maxchar);
1463 }
1464 }
1465#endif
1466
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001467 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001468 if (check_maxchar
1469 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001471 /* Writing Latin-1 characters into an ASCII string requires to
1472 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001473 Py_UCS4 max_char;
1474 max_char = ucs1lib_find_max_char(from_data,
1475 (Py_UCS1*)from_data + how_many);
1476 if (max_char >= 128)
1477 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001478 }
Christian Heimesf051e432016-09-13 20:22:02 +02001479 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001480 (char*)from_data + from_kind * from_start,
1481 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001483 else if (from_kind == PyUnicode_1BYTE_KIND
1484 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001485 {
1486 _PyUnicode_CONVERT_BYTES(
1487 Py_UCS1, Py_UCS2,
1488 PyUnicode_1BYTE_DATA(from) + from_start,
1489 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490 PyUnicode_2BYTE_DATA(to) + to_start
1491 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001492 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001493 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001494 && to_kind == PyUnicode_4BYTE_KIND)
1495 {
1496 _PyUnicode_CONVERT_BYTES(
1497 Py_UCS1, Py_UCS4,
1498 PyUnicode_1BYTE_DATA(from) + from_start,
1499 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500 PyUnicode_4BYTE_DATA(to) + to_start
1501 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001502 }
1503 else if (from_kind == PyUnicode_2BYTE_KIND
1504 && to_kind == PyUnicode_4BYTE_KIND)
1505 {
1506 _PyUnicode_CONVERT_BYTES(
1507 Py_UCS2, Py_UCS4,
1508 PyUnicode_2BYTE_DATA(from) + from_start,
1509 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510 PyUnicode_4BYTE_DATA(to) + to_start
1511 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001512 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001514 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001516 if (!check_maxchar) {
1517 if (from_kind == PyUnicode_2BYTE_KIND
1518 && to_kind == PyUnicode_1BYTE_KIND)
1519 {
1520 _PyUnicode_CONVERT_BYTES(
1521 Py_UCS2, Py_UCS1,
1522 PyUnicode_2BYTE_DATA(from) + from_start,
1523 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524 PyUnicode_1BYTE_DATA(to) + to_start
1525 );
1526 }
1527 else if (from_kind == PyUnicode_4BYTE_KIND
1528 && to_kind == PyUnicode_1BYTE_KIND)
1529 {
1530 _PyUnicode_CONVERT_BYTES(
1531 Py_UCS4, Py_UCS1,
1532 PyUnicode_4BYTE_DATA(from) + from_start,
1533 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534 PyUnicode_1BYTE_DATA(to) + to_start
1535 );
1536 }
1537 else if (from_kind == PyUnicode_4BYTE_KIND
1538 && to_kind == PyUnicode_2BYTE_KIND)
1539 {
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS4, Py_UCS2,
1542 PyUnicode_4BYTE_DATA(from) + from_start,
1543 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544 PyUnicode_2BYTE_DATA(to) + to_start
1545 );
1546 }
1547 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001548 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001549 }
1550 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001551 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001554 Py_ssize_t i;
1555
Victor Stinnera0702ab2011-09-29 14:14:38 +02001556 for (i=0; i < how_many; i++) {
1557 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001558 if (ch > to_maxchar)
1559 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001560 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001562 }
1563 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return 0;
1565}
1566
Victor Stinnerd3f08822012-05-29 12:57:52 +02001567void
1568_PyUnicode_FastCopyCharacters(
1569 PyObject *to, Py_ssize_t to_start,
1570 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571{
1572 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573}
1574
1575Py_ssize_t
1576PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577 PyObject *from, Py_ssize_t from_start,
1578 Py_ssize_t how_many)
1579{
1580 int err;
1581
1582 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586
Benjamin Petersonbac79492012-01-14 13:34:47 -05001587 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001588 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001589 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 return -1;
1591
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001592 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001593 PyErr_SetString(PyExc_IndexError, "string index out of range");
1594 return -1;
1595 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001596 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001597 PyErr_SetString(PyExc_IndexError, "string index out of range");
1598 return -1;
1599 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001600 if (how_many < 0) {
1601 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602 return -1;
1603 }
1604 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001605 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001607 "Cannot write %zi characters at %zi "
1608 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001609 how_many, to_start, PyUnicode_GET_LENGTH(to));
1610 return -1;
1611 }
1612
1613 if (how_many == 0)
1614 return 0;
1615
Victor Stinner488fa492011-12-12 00:01:39 +01001616 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001617 return -1;
1618
1619 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620 if (err) {
1621 PyErr_Format(PyExc_SystemError,
1622 "Cannot copy %s characters "
1623 "into a string of %s characters",
1624 unicode_kind_name(from),
1625 unicode_kind_name(to));
1626 return -1;
1627 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001628 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629}
1630
Victor Stinner17222162011-09-28 22:15:37 +02001631/* Find the maximum code point and count the number of surrogate pairs so a
1632 correct string length can be computed before converting a string to UCS4.
1633 This function counts single surrogates as a character and not as a pair.
1634
1635 Return 0 on success, or -1 on error. */
1636static int
1637find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639{
1640 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001641 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642
Victor Stinnerc53be962011-10-02 21:33:54 +02001643 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 *num_surrogates = 0;
1645 *maxchar = 0;
1646
1647 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001649 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650 && (iter+1) < end
1651 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652 {
1653 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654 ++(*num_surrogates);
1655 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 }
1657 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001659 {
1660 ch = *iter;
1661 iter++;
1662 }
1663 if (ch > *maxchar) {
1664 *maxchar = ch;
1665 if (*maxchar > MAX_UNICODE) {
1666 PyErr_Format(PyExc_ValueError,
1667 "character U+%x is not in range [U+0000; U+10ffff]",
1668 ch);
1669 return -1;
1670 }
1671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 }
1673 return 0;
1674}
1675
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001676int
1677_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678{
1679 wchar_t *end;
1680 Py_UCS4 maxchar = 0;
1681 Py_ssize_t num_surrogates;
1682#if SIZEOF_WCHAR_T == 2
1683 Py_ssize_t length_wo_surrogates;
1684#endif
1685
Georg Brandl7597add2011-10-05 16:36:47 +02001686 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001687 strings were created using _PyObject_New() and where no canonical
1688 representation (the str field) has been set yet aka strings
1689 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001690 assert(_PyUnicode_CHECK(unicode));
1691 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001694 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001695 /* Actually, it should neither be interned nor be anything else: */
1696 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001699 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001700 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702
1703 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1705 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 PyErr_NoMemory();
1707 return -1;
1708 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001709 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 _PyUnicode_WSTR(unicode), end,
1711 PyUnicode_1BYTE_DATA(unicode));
1712 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1715 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001716 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001717 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001718 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 }
1720 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001721 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001722 _PyUnicode_UTF8(unicode) = NULL;
1723 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 }
1725 PyObject_FREE(_PyUnicode_WSTR(unicode));
1726 _PyUnicode_WSTR(unicode) = NULL;
1727 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1728 }
1729 /* In this case we might have to convert down from 4-byte native
1730 wchar_t to 2-byte unicode. */
1731 else if (maxchar < 65536) {
1732 assert(num_surrogates == 0 &&
1733 "FindMaxCharAndNumSurrogatePairs() messed up");
1734
Victor Stinner506f5922011-09-28 22:34:18 +02001735#if SIZEOF_WCHAR_T == 2
1736 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001737 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001738 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1739 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001743#else
1744 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001745 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001746 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001748 PyErr_NoMemory();
1749 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 }
Victor Stinner506f5922011-09-28 22:34:18 +02001751 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1752 _PyUnicode_WSTR(unicode), end,
1753 PyUnicode_2BYTE_DATA(unicode));
1754 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1755 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1756 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8(unicode) = NULL;
1758 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001759 PyObject_FREE(_PyUnicode_WSTR(unicode));
1760 _PyUnicode_WSTR(unicode) = NULL;
1761 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1762#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
1764 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1765 else {
1766#if SIZEOF_WCHAR_T == 2
1767 /* in case the native representation is 2-bytes, we need to allocate a
1768 new normalized 4-byte version. */
1769 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001770 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1771 PyErr_NoMemory();
1772 return -1;
1773 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1775 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 PyErr_NoMemory();
1777 return -1;
1778 }
1779 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1780 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001781 _PyUnicode_UTF8(unicode) = NULL;
1782 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001783 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1784 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001785 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 PyObject_FREE(_PyUnicode_WSTR(unicode));
1787 _PyUnicode_WSTR(unicode) = NULL;
1788 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1789#else
1790 assert(num_surrogates == 0);
1791
Victor Stinnerc3c74152011-10-02 20:39:55 +02001792 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1797#endif
1798 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1799 }
1800 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001801 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 return 0;
1803}
1804
Alexander Belopolsky40018472011-02-26 01:02:56 +00001805static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001806unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807{
Walter Dörwald16807132007-05-25 13:52:07 +00001808 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 case SSTATE_NOT_INTERNED:
1810 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001811
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 case SSTATE_INTERNED_MORTAL:
1813 /* revive dead object temporarily for DelItem */
1814 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001815 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001816 Py_FatalError(
1817 "deletion of interned string failed");
1818 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001819
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 case SSTATE_INTERNED_IMMORTAL:
1821 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001822 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001823
Benjamin Peterson29060642009-01-31 22:14:21 +00001824 default:
1825 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001826 }
1827
Victor Stinner03490912011-10-03 23:45:12 +02001828 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001830 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001832 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001835 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836}
1837
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   one of the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a ready string of length 1 can be in the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
        if (ch < 256 && unicode_latin1[ch] == unicode) {
            return 1;
        }
    }
    return 0;
}
#endif
1854
Alexander Belopolsky40018472011-02-26 01:02:56 +00001855static int
Victor Stinner488fa492011-12-12 00:01:39 +01001856unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001857{
Victor Stinner488fa492011-12-12 00:01:39 +01001858 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001859 if (Py_REFCNT(unicode) != 1)
1860 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (_PyUnicode_HASH(unicode) != -1)
1862 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001863 if (PyUnicode_CHECK_INTERNED(unicode))
1864 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001865 if (!PyUnicode_CheckExact(unicode))
1866 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001867#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001868 /* singleton refcount is greater than 1 */
1869 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001870#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001871 return 1;
1872}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874static int
1875unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876{
1877 PyObject *unicode;
1878 Py_ssize_t old_length;
1879
1880 assert(p_unicode != NULL);
1881 unicode = *p_unicode;
1882
1883 assert(unicode != NULL);
1884 assert(PyUnicode_Check(unicode));
1885 assert(0 <= length);
1886
Victor Stinner910337b2011-10-03 03:20:16 +02001887 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 old_length = PyUnicode_WSTR_LENGTH(unicode);
1889 else
1890 old_length = PyUnicode_GET_LENGTH(unicode);
1891 if (old_length == length)
1892 return 0;
1893
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001894 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001895 _Py_INCREF_UNICODE_EMPTY();
1896 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001898 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001899 return 0;
1900 }
1901
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001903 PyObject *copy = resize_copy(unicode, length);
1904 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001906 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001908 }
1909
Victor Stinnerfe226c02011-10-03 03:52:20 +02001910 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001911 PyObject *new_unicode = resize_compact(unicode, length);
1912 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001913 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001914 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001915 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001916 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001917 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001918}
1919
Alexander Belopolsky40018472011-02-26 01:02:56 +00001920int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001921PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001922{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001923 PyObject *unicode;
1924 if (p_unicode == NULL) {
1925 PyErr_BadInternalCall();
1926 return -1;
1927 }
1928 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001929 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001930 {
1931 PyErr_BadInternalCall();
1932 return -1;
1933 }
1934 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001935}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001936
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001937/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001938
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001939 WARNING: The function doesn't copy the terminating null character and
1940 doesn't check the maximum character (may write a latin1 character in an
1941 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001942static void
1943unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001945{
1946 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001948 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001949
1950 switch (kind) {
1951 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001952 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001953#ifdef Py_DEBUG
1954 if (PyUnicode_IS_ASCII(unicode)) {
1955 Py_UCS4 maxchar = ucs1lib_find_max_char(
1956 (const Py_UCS1*)str,
1957 (const Py_UCS1*)str + len);
1958 assert(maxchar < 128);
1959 }
1960#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001961 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001962 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001963 }
1964 case PyUnicode_2BYTE_KIND: {
1965 Py_UCS2 *start = (Py_UCS2 *)data + index;
1966 Py_UCS2 *ucs2 = start;
1967 assert(index <= PyUnicode_GET_LENGTH(unicode));
1968
Victor Stinner184252a2012-06-16 02:57:41 +02001969 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001970 *ucs2 = (Py_UCS2)*str;
1971
1972 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001973 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001974 }
1975 default: {
1976 Py_UCS4 *start = (Py_UCS4 *)data + index;
1977 Py_UCS4 *ucs4 = start;
1978 assert(kind == PyUnicode_4BYTE_KIND);
1979 assert(index <= PyUnicode_GET_LENGTH(unicode));
1980
Victor Stinner184252a2012-06-16 02:57:41 +02001981 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982 *ucs4 = (Py_UCS4)*str;
1983
1984 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001985 }
1986 }
1987}
1988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989static PyObject*
1990get_latin1_char(unsigned char ch)
1991{
Victor Stinnera464fc12011-10-02 20:39:30 +02001992 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001994 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 if (!unicode)
1996 return NULL;
1997 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001998 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 unicode_latin1[ch] = unicode;
2000 }
2001 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002002 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003}
2004
Victor Stinner985a82a2014-01-03 12:53:47 +01002005static PyObject*
2006unicode_char(Py_UCS4 ch)
2007{
2008 PyObject *unicode;
2009
2010 assert(ch <= MAX_UNICODE);
2011
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002012 if (ch < 256)
2013 return get_latin1_char(ch);
2014
Victor Stinner985a82a2014-01-03 12:53:47 +01002015 unicode = PyUnicode_New(1, ch);
2016 if (unicode == NULL)
2017 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002018
2019 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002022 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025 }
2026 assert(_PyUnicode_CheckConsistency(unicode, 1));
2027 return unicode;
2028}
2029
Alexander Belopolsky40018472011-02-26 01:02:56 +00002030PyObject *
2031PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002033 if (u == NULL)
2034 return (PyObject*)_PyUnicode_New(size);
2035
2036 if (size < 0) {
2037 PyErr_BadInternalCall();
2038 return NULL;
2039 }
2040
2041 return PyUnicode_FromWideChar(u, size);
2042}
2043
2044PyObject *
2045PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002047 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 Py_UCS4 maxchar = 0;
2049 Py_ssize_t num_surrogates;
2050
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002051 if (u == NULL && size != 0) {
2052 PyErr_BadInternalCall();
2053 return NULL;
2054 }
2055
2056 if (size == -1) {
2057 size = wcslen(u);
2058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002060 /* If the Unicode data is known at construction time, we can apply
2061 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002064 if (size == 0)
2065 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 /* Single character Unicode objects in the Latin-1 range are
2068 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002069 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 return get_latin1_char((unsigned char)*u);
2071
2072 /* If not empty and not single character, copy the Unicode data
2073 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002074 if (find_maxchar_surrogates(u, u + size,
2075 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 return NULL;
2077
Victor Stinner8faf8212011-12-08 22:14:11 +01002078 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 if (!unicode)
2080 return NULL;
2081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 switch (PyUnicode_KIND(unicode)) {
2083 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002084 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086 break;
2087 case PyUnicode_2BYTE_KIND:
2088#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002089 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002091 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093#endif
2094 break;
2095 case PyUnicode_4BYTE_KIND:
2096#if SIZEOF_WCHAR_T == 2
2097 /* This is the only case which has to process surrogates, thus
2098 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002099 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100#else
2101 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002102 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103#endif
2104 break;
2105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002106 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002109 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110}
2111
Alexander Belopolsky40018472011-02-26 01:02:56 +00002112PyObject *
2113PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002114{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002115 if (size < 0) {
2116 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002117 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 return NULL;
2119 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002120 if (u != NULL)
2121 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122 else
2123 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002128{
2129 size_t size = strlen(u);
2130 if (size > PY_SSIZE_T_MAX) {
2131 PyErr_SetString(PyExc_OverflowError, "input too long");
2132 return NULL;
2133 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002134 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002135}
2136
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137PyObject *
2138_PyUnicode_FromId(_Py_Identifier *id)
2139{
2140 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002141 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142 strlen(id->string),
2143 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002144 if (!id->object)
2145 return NULL;
2146 PyUnicode_InternInPlace(&id->object);
2147 assert(!id->next);
2148 id->next = static_strings;
2149 static_strings = id;
2150 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002151 return id->object;
2152}
2153
2154void
2155_PyUnicode_ClearStaticStrings()
2156{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002157 _Py_Identifier *tmp, *s = static_strings;
2158 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002159 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002160 tmp = s->next;
2161 s->next = NULL;
2162 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002163 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002164 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002165}
2166
Benjamin Peterson0df54292012-03-26 14:50:32 -04002167/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002168
Victor Stinnerd3f08822012-05-29 12:57:52 +02002169PyObject*
2170_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002171{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002172 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002173 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002174 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002175#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002176 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002177#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002178 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002179 }
Victor Stinner785938e2011-12-11 20:09:03 +01002180 unicode = PyUnicode_New(size, 127);
2181 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002182 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002183 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184 assert(_PyUnicode_CheckConsistency(unicode, 1));
2185 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002186}
2187
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002188static Py_UCS4
2189kind_maxchar_limit(unsigned int kind)
2190{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002191 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002192 case PyUnicode_1BYTE_KIND:
2193 return 0x80;
2194 case PyUnicode_2BYTE_KIND:
2195 return 0x100;
2196 case PyUnicode_4BYTE_KIND:
2197 return 0x10000;
2198 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002199 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002200 }
2201}
2202
Victor Stinner702c7342011-10-05 13:50:52 +02002203static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002204_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002207 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002208
Serhiy Storchaka678db842013-01-26 12:16:36 +02002209 if (size == 0)
2210 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002211 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002212 if (size == 1)
2213 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002215 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002216 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 if (!res)
2218 return NULL;
2219 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002220 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002222}
2223
Victor Stinnere57b1c02011-09-28 22:20:48 +02002224static PyObject*
2225_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226{
2227 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002228 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002229
Serhiy Storchaka678db842013-01-26 12:16:36 +02002230 if (size == 0)
2231 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002232 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002233 if (size == 1)
2234 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002235
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002236 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002237 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 if (!res)
2239 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002240 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002242 else {
2243 _PyUnicode_CONVERT_BYTES(
2244 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002246 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 return res;
2248}
2249
Victor Stinnere57b1c02011-09-28 22:20:48 +02002250static PyObject*
2251_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252{
2253 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002254 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255
Serhiy Storchaka678db842013-01-26 12:16:36 +02002256 if (size == 0)
2257 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002258 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002259 if (size == 1)
2260 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002261
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002262 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002263 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 if (!res)
2265 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002266 if (max_char < 256)
2267 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268 PyUnicode_1BYTE_DATA(res));
2269 else if (max_char < 0x10000)
2270 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271 PyUnicode_2BYTE_DATA(res));
2272 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002274 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 return res;
2276}
2277
2278PyObject*
2279PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002281 if (size < 0) {
2282 PyErr_SetString(PyExc_ValueError, "size must be positive");
2283 return NULL;
2284 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002285 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002289 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002291 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002293 PyErr_SetString(PyExc_SystemError, "invalid kind");
2294 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296}
2297
Victor Stinnerece58de2012-04-23 23:36:38 +02002298Py_UCS4
2299_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300{
2301 enum PyUnicode_Kind kind;
2302 void *startptr, *endptr;
2303
2304 assert(PyUnicode_IS_READY(unicode));
2305 assert(0 <= start);
2306 assert(end <= PyUnicode_GET_LENGTH(unicode));
2307 assert(start <= end);
2308
2309 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310 return PyUnicode_MAX_CHAR_VALUE(unicode);
2311
2312 if (start == end)
2313 return 127;
2314
Victor Stinner94d558b2012-04-27 22:26:58 +02002315 if (PyUnicode_IS_ASCII(unicode))
2316 return 127;
2317
Victor Stinnerece58de2012-04-23 23:36:38 +02002318 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002319 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002320 endptr = (char *)startptr + end * kind;
2321 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002322 switch(kind) {
2323 case PyUnicode_1BYTE_KIND:
2324 return ucs1lib_find_max_char(startptr, endptr);
2325 case PyUnicode_2BYTE_KIND:
2326 return ucs2lib_find_max_char(startptr, endptr);
2327 case PyUnicode_4BYTE_KIND:
2328 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002329 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002330 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002331 }
2332}
2333
Victor Stinner25a4b292011-10-06 12:31:55 +02002334/* Ensure that a string uses the most efficient storage, if it is not the
2335 case: create a new string with of the right kind. Write NULL into *p_unicode
2336 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002337static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002338unicode_adjust_maxchar(PyObject **p_unicode)
2339{
2340 PyObject *unicode, *copy;
2341 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002342 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002343 unsigned int kind;
2344
2345 assert(p_unicode != NULL);
2346 unicode = *p_unicode;
2347 assert(PyUnicode_IS_READY(unicode));
2348 if (PyUnicode_IS_ASCII(unicode))
2349 return;
2350
2351 len = PyUnicode_GET_LENGTH(unicode);
2352 kind = PyUnicode_KIND(unicode);
2353 if (kind == PyUnicode_1BYTE_KIND) {
2354 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 max_char = ucs1lib_find_max_char(u, u + len);
2356 if (max_char >= 128)
2357 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 }
2359 else if (kind == PyUnicode_2BYTE_KIND) {
2360 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002361 max_char = ucs2lib_find_max_char(u, u + len);
2362 if (max_char >= 256)
2363 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 }
2365 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002366 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002367 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002368 max_char = ucs4lib_find_max_char(u, u + len);
2369 if (max_char >= 0x10000)
2370 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002372 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002373 if (copy != NULL)
2374 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002375 Py_DECREF(unicode);
2376 *p_unicode = copy;
2377}
2378
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002380_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002381{
Victor Stinner87af4f22011-11-21 23:03:47 +01002382 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002384
Victor Stinner034f6cf2011-09-30 02:26:44 +02002385 if (!PyUnicode_Check(unicode)) {
2386 PyErr_BadInternalCall();
2387 return NULL;
2388 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002389 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002390 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002391
Victor Stinner87af4f22011-11-21 23:03:47 +01002392 length = PyUnicode_GET_LENGTH(unicode);
2393 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002394 if (!copy)
2395 return NULL;
2396 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397
Christian Heimesf051e432016-09-13 20:22:02 +02002398 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002399 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002400 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002401 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002402}
2403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405/* Widen Unicode objects to larger buffers. Don't write terminating null
2406 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407
2408void*
2409_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002411 Py_ssize_t len;
2412 void *result;
2413 unsigned int skind;
2414
Benjamin Petersonbac79492012-01-14 13:34:47 -05002415 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 return NULL;
2417
2418 len = PyUnicode_GET_LENGTH(s);
2419 skind = PyUnicode_KIND(s);
2420 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002421 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 return NULL;
2423 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002424 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 assert(skind == PyUnicode_1BYTE_KIND);
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS1, Py_UCS2,
2432 PyUnicode_1BYTE_DATA(s),
2433 PyUnicode_1BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002437 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002438 if (!result)
2439 return PyErr_NoMemory();
2440 if (skind == PyUnicode_2BYTE_KIND) {
2441 _PyUnicode_CONVERT_BYTES(
2442 Py_UCS2, Py_UCS4,
2443 PyUnicode_2BYTE_DATA(s),
2444 PyUnicode_2BYTE_DATA(s) + len,
2445 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002447 else {
2448 assert(skind == PyUnicode_1BYTE_KIND);
2449 _PyUnicode_CONVERT_BYTES(
2450 Py_UCS1, Py_UCS4,
2451 PyUnicode_1BYTE_DATA(s),
2452 PyUnicode_1BYTE_DATA(s) + len,
2453 result);
2454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002456 default:
2457 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 }
Victor Stinner01698042011-10-04 00:04:26 +02002459 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 return NULL;
2461}
2462
2463static Py_UCS4*
2464as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
2467 int kind;
2468 void *data;
2469 Py_ssize_t len, targetlen;
2470 if (PyUnicode_READY(string) == -1)
2471 return NULL;
2472 kind = PyUnicode_KIND(string);
2473 data = PyUnicode_DATA(string);
2474 len = PyUnicode_GET_LENGTH(string);
2475 targetlen = len;
2476 if (copy_null)
2477 targetlen++;
2478 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002479 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 if (!target) {
2481 PyErr_NoMemory();
2482 return NULL;
2483 }
2484 }
2485 else {
2486 if (targetsize < targetlen) {
2487 PyErr_Format(PyExc_SystemError,
2488 "string is longer than the buffer");
2489 if (copy_null && 0 < targetsize)
2490 target[0] = 0;
2491 return NULL;
2492 }
2493 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 if (kind == PyUnicode_1BYTE_KIND) {
2495 Py_UCS1 *start = (Py_UCS1 *) data;
2496 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002498 else if (kind == PyUnicode_2BYTE_KIND) {
2499 Py_UCS2 *start = (Py_UCS2 *) data;
2500 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501 }
2502 else {
2503 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002504 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (copy_null)
2507 target[len] = 0;
2508 return target;
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513 int copy_null)
2514{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002515 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 PyErr_BadInternalCall();
2517 return NULL;
2518 }
2519 return as_ucs4(string, target, targetsize, copy_null);
2520}
2521
2522Py_UCS4*
2523PyUnicode_AsUCS4Copy(PyObject *string)
2524{
2525 return as_ucs4(string, NULL, 0, 1);
2526}
2527
/* Size of the buffers used to format a number with %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, and one more character is reserved for the sign.
   53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002532
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533static int
2534unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535 Py_ssize_t width, Py_ssize_t precision)
2536{
2537 Py_ssize_t length, fill, arglen;
2538 Py_UCS4 maxchar;
2539
2540 if (PyUnicode_READY(str) == -1)
2541 return -1;
2542
2543 length = PyUnicode_GET_LENGTH(str);
2544 if ((precision == -1 || precision >= length)
2545 && width <= length)
2546 return _PyUnicodeWriter_WriteStr(writer, str);
2547
2548 if (precision != -1)
2549 length = Py_MIN(precision, length);
2550
2551 arglen = Py_MAX(length, width);
2552 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554 else
2555 maxchar = writer->maxchar;
2556
2557 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558 return -1;
2559
2560 if (width > length) {
2561 fill = width - length;
2562 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563 return -1;
2564 writer->pos += fill;
2565 }
2566
2567 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568 str, 0, length);
2569 writer->pos += length;
2570 return 0;
2571}
2572
2573static int
2574unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2575 Py_ssize_t width, Py_ssize_t precision)
2576{
2577 /* UTF-8 */
2578 Py_ssize_t length;
2579 PyObject *unicode;
2580 int res;
2581
2582 length = strlen(str);
2583 if (precision != -1)
2584 length = Py_MIN(length, precision);
2585 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2586 if (unicode == NULL)
2587 return -1;
2588
2589 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2590 Py_DECREF(unicode);
2591 return res;
2592}
2593
Victor Stinner96865452011-03-01 23:44:09 +00002594static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002595unicode_fromformat_arg(_PyUnicodeWriter *writer,
2596 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002597{
Victor Stinnere215d962012-10-06 23:03:36 +02002598 const char *p;
2599 Py_ssize_t len;
2600 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 Py_ssize_t width;
2602 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002603 int longflag;
2604 int longlongflag;
2605 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002607
2608 p = f;
2609 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002610 zeropad = 0;
2611 if (*f == '0') {
2612 zeropad = 1;
2613 f++;
2614 }
Victor Stinner96865452011-03-01 23:44:09 +00002615
2616 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 width = -1;
2618 if (Py_ISDIGIT((unsigned)*f)) {
2619 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002620 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002623 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002624 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002625 return NULL;
2626 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002628 f++;
2629 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 }
2631 precision = -1;
2632 if (*f == '.') {
2633 f++;
2634 if (Py_ISDIGIT((unsigned)*f)) {
2635 precision = (*f - '0');
2636 f++;
2637 while (Py_ISDIGIT((unsigned)*f)) {
2638 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2639 PyErr_SetString(PyExc_ValueError,
2640 "precision too big");
2641 return NULL;
2642 }
2643 precision = (precision * 10) + (*f - '0');
2644 f++;
2645 }
2646 }
Victor Stinner96865452011-03-01 23:44:09 +00002647 if (*f == '%') {
2648 /* "%.3%s" => f points to "3" */
2649 f--;
2650 }
2651 }
2652 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002653 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002654 f--;
2655 }
Victor Stinner96865452011-03-01 23:44:09 +00002656
2657 /* Handle %ld, %lu, %lld and %llu. */
2658 longflag = 0;
2659 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002660 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002661 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002663 longflag = 1;
2664 ++f;
2665 }
Victor Stinner96865452011-03-01 23:44:09 +00002666 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002667 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002668 longlongflag = 1;
2669 f += 2;
2670 }
Victor Stinner96865452011-03-01 23:44:09 +00002671 }
2672 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002673 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002674 size_tflag = 1;
2675 ++f;
2676 }
Victor Stinnere215d962012-10-06 23:03:36 +02002677
2678 if (f[1] == '\0')
2679 writer->overallocate = 0;
2680
2681 switch (*f) {
2682 case 'c':
2683 {
2684 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002685 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002686 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002687 "character argument not in range(0x110000)");
2688 return NULL;
2689 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002690 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002691 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002692 break;
2693 }
2694
2695 case 'i':
2696 case 'd':
2697 case 'u':
2698 case 'x':
2699 {
2700 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002701 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002702 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002703
2704 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002705 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002708 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002709 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002710 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002711 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, size_t));
2714 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002715 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002716 va_arg(*vargs, unsigned int));
2717 }
2718 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002719 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002720 }
2721 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002723 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002725 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002726 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002727 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002728 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002729 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002730 va_arg(*vargs, Py_ssize_t));
2731 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002732 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002733 va_arg(*vargs, int));
2734 }
2735 assert(len >= 0);
2736
Victor Stinnere215d962012-10-06 23:03:36 +02002737 if (precision < len)
2738 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002739
2740 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002741 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2742 return NULL;
2743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 if (width > precision) {
2745 Py_UCS4 fillchar;
2746 fill = width - precision;
2747 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002748 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2749 return NULL;
2750 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002751 }
Victor Stinner15a11362012-10-06 23:48:20 +02002752 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002753 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002754 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2755 return NULL;
2756 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002757 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002758
Victor Stinner4a587072013-11-19 12:54:53 +01002759 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2760 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002761 break;
2762 }
2763
2764 case 'p':
2765 {
2766 char number[MAX_LONG_LONG_CHARS];
2767
2768 len = sprintf(number, "%p", va_arg(*vargs, void*));
2769 assert(len >= 0);
2770
2771 /* %p is ill-defined: ensure leading 0x. */
2772 if (number[1] == 'X')
2773 number[1] = 'x';
2774 else if (number[1] != 'x') {
2775 memmove(number + 2, number,
2776 strlen(number) + 1);
2777 number[0] = '0';
2778 number[1] = 'x';
2779 len += 2;
2780 }
2781
Victor Stinner4a587072013-11-19 12:54:53 +01002782 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002783 return NULL;
2784 break;
2785 }
2786
2787 case 's':
2788 {
2789 /* UTF-8 */
2790 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002793 break;
2794 }
2795
2796 case 'U':
2797 {
2798 PyObject *obj = va_arg(*vargs, PyObject *);
2799 assert(obj && _PyUnicode_CHECK(obj));
2800
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 break;
2804 }
2805
2806 case 'V':
2807 {
2808 PyObject *obj = va_arg(*vargs, PyObject *);
2809 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002810 if (obj) {
2811 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002812 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002813 return NULL;
2814 }
2815 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002816 assert(str != NULL);
2817 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002818 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002819 }
2820 break;
2821 }
2822
2823 case 'S':
2824 {
2825 PyObject *obj = va_arg(*vargs, PyObject *);
2826 PyObject *str;
2827 assert(obj);
2828 str = PyObject_Str(obj);
2829 if (!str)
2830 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002831 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002832 Py_DECREF(str);
2833 return NULL;
2834 }
2835 Py_DECREF(str);
2836 break;
2837 }
2838
2839 case 'R':
2840 {
2841 PyObject *obj = va_arg(*vargs, PyObject *);
2842 PyObject *repr;
2843 assert(obj);
2844 repr = PyObject_Repr(obj);
2845 if (!repr)
2846 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002847 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002848 Py_DECREF(repr);
2849 return NULL;
2850 }
2851 Py_DECREF(repr);
2852 break;
2853 }
2854
2855 case 'A':
2856 {
2857 PyObject *obj = va_arg(*vargs, PyObject *);
2858 PyObject *ascii;
2859 assert(obj);
2860 ascii = PyObject_ASCII(obj);
2861 if (!ascii)
2862 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002863 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002864 Py_DECREF(ascii);
2865 return NULL;
2866 }
2867 Py_DECREF(ascii);
2868 break;
2869 }
2870
2871 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002872 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002873 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002874 break;
2875
2876 default:
2877 /* if we stumble upon an unknown formatting code, copy the rest
2878 of the format string to the output string. (we cannot just
2879 skip the code, since there's no way to know what's in the
2880 argument list) */
2881 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002882 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002883 return NULL;
2884 f = p+len;
2885 return f;
2886 }
2887
2888 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002889 return f;
2890}
2891
Walter Dörwaldd2034312007-05-18 16:29:38 +00002892PyObject *
2893PyUnicode_FromFormatV(const char *format, va_list vargs)
2894{
Victor Stinnere215d962012-10-06 23:03:36 +02002895 va_list vargs2;
2896 const char *f;
2897 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002898
Victor Stinner8f674cc2013-04-17 23:02:17 +02002899 _PyUnicodeWriter_Init(&writer);
2900 writer.min_length = strlen(format) + 100;
2901 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002902
Benjamin Peterson0c212142016-09-20 20:39:33 -07002903 // Copy varags to be able to pass a reference to a subfunction.
2904 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002905
2906 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002908 f = unicode_fromformat_arg(&writer, f, &vargs2);
2909 if (f == NULL)
2910 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 const char *p;
2914 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002915
Victor Stinnere215d962012-10-06 23:03:36 +02002916 p = f;
2917 do
2918 {
2919 if ((unsigned char)*p > 127) {
2920 PyErr_Format(PyExc_ValueError,
2921 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2922 "string, got a non-ASCII byte: 0x%02x",
2923 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925 }
2926 p++;
2927 }
2928 while (*p != '\0' && *p != '%');
2929 len = p - f;
2930
2931 if (*p == '\0')
2932 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002933
2934 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002935 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002936
2937 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002940 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002941 return _PyUnicodeWriter_Finish(&writer);
2942
2943 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002944 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002945 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002946 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002947}
2948
Walter Dörwaldd2034312007-05-18 16:29:38 +00002949PyObject *
2950PyUnicode_FromFormat(const char *format, ...)
2951{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 PyObject* ret;
2953 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954
2955#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002956 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002957#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002958 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002959#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002960 ret = PyUnicode_FromFormatV(format, vargs);
2961 va_end(vargs);
2962 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002963}
2964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965#ifdef HAVE_WCHAR_H
2966
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002967/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002968
Victor Stinnerd88d9832011-09-06 02:00:05 +02002969 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 character) required to convert the unicode object. Ignore size argument.
2971
Victor Stinnerd88d9832011-09-06 02:00:05 +02002972 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002973 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002974 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002975Py_ssize_t
2976PyUnicode_AsWideChar(PyObject *unicode,
2977 wchar_t *w,
2978 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002979{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002980 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002981 const wchar_t *wstr;
2982
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002983 if (unicode == NULL) {
2984 PyErr_BadInternalCall();
2985 return -1;
2986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002988 if (wstr == NULL)
2989 return -1;
2990
Victor Stinner5593d8a2010-10-02 11:11:27 +00002991 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002992 if (size > res)
2993 size = res + 1;
2994 else
2995 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002996 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002997 return res;
2998 }
2999 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003000 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00003001}
3002
Victor Stinner137c34c2010-09-29 10:25:54 +00003003wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003004PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003005 Py_ssize_t *size)
3006{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003007 const wchar_t *wstr;
3008 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003009 Py_ssize_t buflen;
3010
3011 if (unicode == NULL) {
3012 PyErr_BadInternalCall();
3013 return NULL;
3014 }
3015
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003016 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3017 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003018 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003019 }
3020 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3021 PyErr_SetString(PyExc_ValueError,
3022 "embedded null character");
3023 return NULL;
3024 }
3025
3026 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003027 if (buffer == NULL) {
3028 PyErr_NoMemory();
3029 return NULL;
3030 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003031 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003032 if (size != NULL)
3033 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003034 return buffer;
3035}
3036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003037#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038
Alexander Belopolsky40018472011-02-26 01:02:56 +00003039PyObject *
3040PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003041{
Victor Stinner8faf8212011-12-08 22:14:11 +01003042 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 PyErr_SetString(PyExc_ValueError,
3044 "chr() arg not in range(0x110000)");
3045 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003046 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003047
Victor Stinner985a82a2014-01-03 12:53:47 +01003048 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003052PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003054 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003056 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003057 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003058 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 Py_INCREF(obj);
3060 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003061 }
3062 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003063 /* For a Unicode subtype that's not a Unicode object,
3064 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003065 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003066 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003067 PyErr_Format(PyExc_TypeError,
3068 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003069 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003070 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071}
3072
Alexander Belopolsky40018472011-02-26 01:02:56 +00003073PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003074PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003075 const char *encoding,
3076 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003077{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003078 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003079 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003080
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 PyErr_BadInternalCall();
3083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003086 /* Decoding bytes objects is the most common case and should be fast */
3087 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003088 if (PyBytes_GET_SIZE(obj) == 0)
3089 _Py_RETURN_UNICODE_EMPTY();
3090 v = PyUnicode_Decode(
3091 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3092 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 return v;
3094 }
3095
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003096 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 PyErr_SetString(PyExc_TypeError,
3098 "decoding str is not supported");
3099 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003100 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003101
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003102 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3103 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3104 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003105 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003106 Py_TYPE(obj)->tp_name);
3107 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003108 }
Tim Petersced69f82003-09-16 20:30:58 +00003109
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003110 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003111 PyBuffer_Release(&buffer);
3112 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003114
Serhiy Storchaka05997252013-01-26 12:14:02 +02003115 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003116 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003117 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118}
3119
/* Normalize an encoding name into `lower`: similar to
   encodings.normalize_encoding(), but the result is also lowercased.

   Alphanumeric characters and '.' are copied (lowercased).  Every run of
   other characters is collapsed into a single '_', except that a run at the
   very start or the very end of the name produces nothing.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    /* Last usable slot: reserved for the terminating null. */
    dst_end = &lower[lower_len - 1];
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' only if more name follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3168
Alexander Belopolsky40018472011-02-26 01:02:56 +00003169PyObject *
3170PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003171 Py_ssize_t size,
3172 const char *encoding,
3173 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003174{
3175 PyObject *buffer = NULL, *unicode;
3176 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003177 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3178
3179 if (encoding == NULL) {
3180 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3181 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003182
Fred Drakee4315f52000-05-09 19:53:39 +00003183 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003184 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3185 char *lower = buflower;
3186
3187 /* Fast paths */
3188 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3189 lower += 3;
3190 if (*lower == '_') {
3191 /* Match "utf8" and "utf_8" */
3192 lower++;
3193 }
3194
3195 if (lower[0] == '8' && lower[1] == 0) {
3196 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3197 }
3198 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3199 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3200 }
3201 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3203 }
3204 }
3205 else {
3206 if (strcmp(lower, "ascii") == 0
3207 || strcmp(lower, "us_ascii") == 0) {
3208 return PyUnicode_DecodeASCII(s, size, errors);
3209 }
Steve Dowercc16be82016-09-08 10:35:16 -07003210 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003211 else if (strcmp(lower, "mbcs") == 0) {
3212 return PyUnicode_DecodeMBCS(s, size, errors);
3213 }
3214 #endif
3215 else if (strcmp(lower, "latin1") == 0
3216 || strcmp(lower, "latin_1") == 0
3217 || strcmp(lower, "iso_8859_1") == 0
3218 || strcmp(lower, "iso8859_1") == 0) {
3219 return PyUnicode_DecodeLatin1(s, size, errors);
3220 }
3221 }
Victor Stinner37296e82010-06-10 13:36:23 +00003222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223
3224 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003225 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003226 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003227 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003228 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 if (buffer == NULL)
3230 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003231 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 if (unicode == NULL)
3233 goto onError;
3234 if (!PyUnicode_Check(unicode)) {
3235 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003236 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3237 "use codecs.decode() to decode to arbitrary types",
3238 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003239 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 Py_DECREF(unicode);
3241 goto onError;
3242 }
3243 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003244 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003245
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 Py_XDECREF(buffer);
3248 return NULL;
3249}
3250
Alexander Belopolsky40018472011-02-26 01:02:56 +00003251PyObject *
3252PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003253 const char *encoding,
3254 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003256 if (!PyUnicode_Check(unicode)) {
3257 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003258 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003259 }
3260
Serhiy Storchaka00939072016-10-27 21:05:49 +03003261 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3262 "PyUnicode_AsDecodedObject() is deprecated; "
3263 "use PyCodec_Decode() to decode from str", 1) < 0)
3264 return NULL;
3265
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003268
3269 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003270 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003271}
3272
Alexander Belopolsky40018472011-02-26 01:02:56 +00003273PyObject *
3274PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003275 const char *encoding,
3276 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003277{
3278 PyObject *v;
3279
3280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_BadArgument();
3282 goto onError;
3283 }
3284
Serhiy Storchaka00939072016-10-27 21:05:49 +03003285 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3286 "PyUnicode_AsDecodedUnicode() is deprecated; "
3287 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3288 return NULL;
3289
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292
3293 /* Decode via the codec registry */
3294 v = PyCodec_Decode(unicode, encoding, errors);
3295 if (v == NULL)
3296 goto onError;
3297 if (!PyUnicode_Check(v)) {
3298 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003299 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3300 "use codecs.decode() to decode to arbitrary types",
3301 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003302 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003303 Py_DECREF(v);
3304 goto onError;
3305 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003306 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003307
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003309 return NULL;
3310}
3311
Alexander Belopolsky40018472011-02-26 01:02:56 +00003312PyObject *
3313PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003314 Py_ssize_t size,
3315 const char *encoding,
3316 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317{
3318 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003319
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003320 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3324 Py_DECREF(unicode);
3325 return v;
3326}
3327
Alexander Belopolsky40018472011-02-26 01:02:56 +00003328PyObject *
3329PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003330 const char *encoding,
3331 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003332{
3333 PyObject *v;
3334
3335 if (!PyUnicode_Check(unicode)) {
3336 PyErr_BadArgument();
3337 goto onError;
3338 }
3339
Serhiy Storchaka00939072016-10-27 21:05:49 +03003340 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3341 "PyUnicode_AsEncodedObject() is deprecated; "
3342 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3343 "or PyCodec_Encode() for generic encoding", 1) < 0)
3344 return NULL;
3345
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003348
3349 /* Encode via the codec registry */
3350 v = PyCodec_Encode(unicode, encoding, errors);
3351 if (v == NULL)
3352 goto onError;
3353 return v;
3354
Benjamin Peterson29060642009-01-31 22:14:21 +00003355 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003356 return NULL;
3357}
3358
Victor Stinner1b579672011-12-17 05:47:23 +01003359static int
3360locale_error_handler(const char *errors, int *surrogateescape)
3361{
Victor Stinner50149202015-09-22 00:26:54 +02003362 _Py_error_handler error_handler = get_error_handler(errors);
3363 switch (error_handler)
3364 {
3365 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003366 *surrogateescape = 0;
3367 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003368 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003369 *surrogateescape = 1;
3370 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003371 default:
3372 PyErr_Format(PyExc_ValueError,
3373 "only 'strict' and 'surrogateescape' error handlers "
3374 "are supported, not '%s'",
3375 errors);
3376 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003377 }
Victor Stinner1b579672011-12-17 05:47:23 +01003378}
3379
Victor Stinner2cba6b82018-01-10 22:46:15 +01003380static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003381unicode_encode_locale(PyObject *unicode, const char *errors,
3382 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003383{
Victor Stinner1b579672011-12-17 05:47:23 +01003384 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003385 if (locale_error_handler(errors, &surrogateescape) < 0)
3386 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003387
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003388 Py_ssize_t wlen;
3389 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3390 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003391 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003392 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003393
Victor Stinner85ab9742018-11-28 12:42:40 +01003394 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003395 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner85ab9742018-11-28 12:42:40 +01003396 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003397 return NULL;
3398 }
3399
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003400 char *str;
3401 size_t error_pos;
3402 const char *reason;
3403 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3404 current_locale, surrogateescape);
Victor Stinner85ab9742018-11-28 12:42:40 +01003405 PyMem_Free(wstr);
3406
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003407 if (res != 0) {
3408 if (res == -2) {
3409 PyObject *exc;
3410 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3411 "locale", unicode,
3412 (Py_ssize_t)error_pos,
3413 (Py_ssize_t)(error_pos+1),
3414 reason);
3415 if (exc != NULL) {
3416 PyCodec_StrictErrors(exc);
3417 Py_DECREF(exc);
3418 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003419 }
3420 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003421 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 }
Victor Stinner85ab9742018-11-28 12:42:40 +01003423 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003426 PyObject *bytes = PyBytes_FromString(str);
3427 PyMem_RawFree(str);
3428 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003429}
3430
Victor Stinnerad158722010-10-27 00:25:46 +00003431PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003432PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3433{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003434 return unicode_encode_locale(unicode, errors, 1);
3435}
3436
3437PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003438PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003439{
Steve Dowercc16be82016-09-08 10:35:16 -07003440#if defined(__APPLE__)
3441 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003442#else
Victor Stinner793b5312011-04-27 00:24:21 +02003443 PyInterpreterState *interp = PyThreadState_GET()->interp;
3444 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3445 cannot use it to encode and decode filenames before it is loaded. Load
3446 the Python codec requires to encode at least its own filename. Use the C
3447 version of the locale codec until the codec registry is initialized and
3448 the Python codec is loaded.
3449
3450 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3451 cannot only rely on it: check also interp->fscodec_initialized for
3452 subinterpreters. */
3453 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003454 return PyUnicode_AsEncodedString(unicode,
3455 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003456 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003457 }
3458 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003459 return unicode_encode_locale(unicode,
3460 Py_FileSystemDefaultEncodeErrors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003461 }
Victor Stinnerad158722010-10-27 00:25:46 +00003462#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003463}
3464
Alexander Belopolsky40018472011-02-26 01:02:56 +00003465PyObject *
3466PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003467 const char *encoding,
3468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469{
3470 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003471 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003472
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 if (!PyUnicode_Check(unicode)) {
3474 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 }
Fred Drakee4315f52000-05-09 19:53:39 +00003477
Victor Stinner942889a2016-09-05 15:40:10 -07003478 if (encoding == NULL) {
3479 return _PyUnicode_AsUTF8String(unicode, errors);
3480 }
3481
Fred Drakee4315f52000-05-09 19:53:39 +00003482 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003483 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3484 char *lower = buflower;
3485
3486 /* Fast paths */
3487 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3488 lower += 3;
3489 if (*lower == '_') {
3490 /* Match "utf8" and "utf_8" */
3491 lower++;
3492 }
3493
3494 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003496 }
3497 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3498 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3499 }
3500 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3501 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3502 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003503 }
Victor Stinner942889a2016-09-05 15:40:10 -07003504 else {
3505 if (strcmp(lower, "ascii") == 0
3506 || strcmp(lower, "us_ascii") == 0) {
3507 return _PyUnicode_AsASCIIString(unicode, errors);
3508 }
Steve Dowercc16be82016-09-08 10:35:16 -07003509#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003510 else if (strcmp(lower, "mbcs") == 0) {
3511 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3512 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003513#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003514 else if (strcmp(lower, "latin1") == 0 ||
3515 strcmp(lower, "latin_1") == 0 ||
3516 strcmp(lower, "iso_8859_1") == 0 ||
3517 strcmp(lower, "iso8859_1") == 0) {
3518 return _PyUnicode_AsLatin1String(unicode, errors);
3519 }
3520 }
Victor Stinner37296e82010-06-10 13:36:23 +00003521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522
3523 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003524 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003526 return NULL;
3527
3528 /* The normal path */
3529 if (PyBytes_Check(v))
3530 return v;
3531
3532 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003533 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003534 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003535 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003536
3537 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003538 "encoder %s returned bytearray instead of bytes; "
3539 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003540 encoding);
3541 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003542 Py_DECREF(v);
3543 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003545
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003546 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3547 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003548 Py_DECREF(v);
3549 return b;
3550 }
3551
3552 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003553 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3554 "use codecs.encode() to encode to arbitrary types",
3555 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003556 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003558 return NULL;
3559}
3560
Alexander Belopolsky40018472011-02-26 01:02:56 +00003561PyObject *
3562PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003563 const char *encoding,
3564 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003565{
3566 PyObject *v;
3567
3568 if (!PyUnicode_Check(unicode)) {
3569 PyErr_BadArgument();
3570 goto onError;
3571 }
3572
Serhiy Storchaka00939072016-10-27 21:05:49 +03003573 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3574 "PyUnicode_AsEncodedUnicode() is deprecated; "
3575 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3576 return NULL;
3577
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003578 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003579 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003580
3581 /* Encode via the codec registry */
3582 v = PyCodec_Encode(unicode, encoding, errors);
3583 if (v == NULL)
3584 goto onError;
3585 if (!PyUnicode_Check(v)) {
3586 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003587 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3588 "use codecs.encode() to encode to arbitrary types",
3589 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003590 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003591 Py_DECREF(v);
3592 goto onError;
3593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003595
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 return NULL;
3598}
3599
Victor Stinner2cba6b82018-01-10 22:46:15 +01003600static PyObject*
3601unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3602 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003603{
Victor Stinner1b579672011-12-17 05:47:23 +01003604 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003605 if (locale_error_handler(errors, &surrogateescape) < 0)
3606 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003607
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003608 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3609 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610 return NULL;
3611 }
3612
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003613 wchar_t *wstr;
3614 size_t wlen;
3615 const char *reason;
3616 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3617 current_locale, surrogateescape);
3618 if (res != 0) {
3619 if (res == -2) {
3620 PyObject *exc;
3621 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3622 "locale", str, len,
3623 (Py_ssize_t)wlen,
3624 (Py_ssize_t)(wlen + 1),
3625 reason);
3626 if (exc != NULL) {
3627 PyCodec_StrictErrors(exc);
3628 Py_DECREF(exc);
3629 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003630 }
3631 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003632 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003633 }
Victor Stinner2f197072011-12-17 07:08:30 +01003634 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003635 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003636
3637 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3638 PyMem_RawFree(wstr);
3639 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003640}
3641
3642PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003643PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3644 const char *errors)
3645{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003646 return unicode_decode_locale(str, len, errors, 1);
3647}
3648
3649PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003650PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003651{
3652 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003653 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003654}
3655
3656
3657PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003658PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003659 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003660 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3661}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003662
Christian Heimes5894ba72007-11-04 11:43:14 +00003663PyObject*
3664PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3665{
Steve Dowercc16be82016-09-08 10:35:16 -07003666#if defined(__APPLE__)
3667 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003668#else
Victor Stinner793b5312011-04-27 00:24:21 +02003669 PyInterpreterState *interp = PyThreadState_GET()->interp;
3670 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3671 cannot use it to encode and decode filenames before it is loaded. Load
3672 the Python codec requires to encode at least its own filename. Use the C
3673 version of the locale codec until the codec registry is initialized and
3674 the Python codec is loaded.
3675
3676 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3677 cannot only rely on it: check also interp->fscodec_initialized for
3678 subinterpreters. */
3679 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003680 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003681 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003682 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003683 }
3684 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003685 return unicode_decode_locale(s, size,
3686 Py_FileSystemDefaultEncodeErrors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003687 }
Victor Stinnerad158722010-10-27 00:25:46 +00003688#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689}
3690
Martin v. Löwis011e8422009-05-05 04:43:17 +00003691
3692int
3693PyUnicode_FSConverter(PyObject* arg, void* addr)
3694{
Brett Cannonec6ce872016-09-06 15:50:29 -07003695 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003696 PyObject *output = NULL;
3697 Py_ssize_t size;
3698 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003699 if (arg == NULL) {
3700 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003701 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003702 return 1;
3703 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003704 path = PyOS_FSPath(arg);
3705 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003706 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003707 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003708 if (PyBytes_Check(path)) {
3709 output = path;
3710 }
3711 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3712 output = PyUnicode_EncodeFSDefault(path);
3713 Py_DECREF(path);
3714 if (!output) {
3715 return 0;
3716 }
3717 assert(PyBytes_Check(output));
3718 }
3719
Victor Stinner0ea2a462010-04-30 00:22:08 +00003720 size = PyBytes_GET_SIZE(output);
3721 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003722 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003723 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003724 Py_DECREF(output);
3725 return 0;
3726 }
3727 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003728 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003729}
3730
3731
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003732int
3733PyUnicode_FSDecoder(PyObject* arg, void* addr)
3734{
Brett Cannona5711202016-09-06 19:36:01 -07003735 int is_buffer = 0;
3736 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003737 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003738 if (arg == NULL) {
3739 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003740 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003741 return 1;
3742 }
Brett Cannona5711202016-09-06 19:36:01 -07003743
3744 is_buffer = PyObject_CheckBuffer(arg);
3745 if (!is_buffer) {
3746 path = PyOS_FSPath(arg);
3747 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003748 return 0;
3749 }
Brett Cannona5711202016-09-06 19:36:01 -07003750 }
3751 else {
3752 path = arg;
3753 Py_INCREF(arg);
3754 }
3755
3756 if (PyUnicode_Check(path)) {
3757 if (PyUnicode_READY(path) == -1) {
3758 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003759 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003760 }
3761 output = path;
3762 }
3763 else if (PyBytes_Check(path) || is_buffer) {
3764 PyObject *path_bytes = NULL;
3765
3766 if (!PyBytes_Check(path) &&
3767 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3768 "path should be string, bytes, or os.PathLike, not %.200s",
3769 Py_TYPE(arg)->tp_name)) {
3770 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003771 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003772 }
3773 path_bytes = PyBytes_FromObject(path);
3774 Py_DECREF(path);
3775 if (!path_bytes) {
3776 return 0;
3777 }
3778 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3779 PyBytes_GET_SIZE(path_bytes));
3780 Py_DECREF(path_bytes);
3781 if (!output) {
3782 return 0;
3783 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003784 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003785 else {
3786 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003787 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003788 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003789 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003790 return 0;
3791 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003792 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003793 Py_DECREF(output);
3794 return 0;
3795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003797 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003798 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003799 Py_DECREF(output);
3800 return 0;
3801 }
3802 *(PyObject**)addr = output;
3803 return Py_CLEANUP_SUPPORTED;
3804}
3805
3806
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003807const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003808PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003809{
Christian Heimesf3863112007-11-22 07:46:41 +00003810 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003812 if (!PyUnicode_Check(unicode)) {
3813 PyErr_BadArgument();
3814 return NULL;
3815 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003816 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003817 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003819 if (PyUnicode_UTF8(unicode) == NULL) {
3820 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003821 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 if (bytes == NULL)
3823 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3825 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003826 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 Py_DECREF(bytes);
3828 return NULL;
3829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003830 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003831 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 PyBytes_AS_STRING(bytes),
3833 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 Py_DECREF(bytes);
3835 }
3836
3837 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003838 *psize = PyUnicode_UTF8_LENGTH(unicode);
3839 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003840}
3841
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003842const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3846}
3847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848Py_UNICODE *
3849PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3850{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003851 const unsigned char *one_byte;
3852#if SIZEOF_WCHAR_T == 4
3853 const Py_UCS2 *two_bytes;
3854#else
3855 const Py_UCS4 *four_bytes;
3856 const Py_UCS4 *ucs4_end;
3857 Py_ssize_t num_surrogates;
3858#endif
3859 wchar_t *w;
3860 wchar_t *wchar_end;
3861
3862 if (!PyUnicode_Check(unicode)) {
3863 PyErr_BadArgument();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003868 assert(_PyUnicode_KIND(unicode) != 0);
3869 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003871 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003873 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3874 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 num_surrogates = 0;
3876
3877 for (; four_bytes < ucs4_end; ++four_bytes) {
3878 if (*four_bytes > 0xFFFF)
3879 ++num_surrogates;
3880 }
3881
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3883 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3884 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885 PyErr_NoMemory();
3886 return NULL;
3887 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 w = _PyUnicode_WSTR(unicode);
3891 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3892 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3894 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003895 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003897 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3898 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 }
3900 else
3901 *w = *four_bytes;
3902
3903 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07003904 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 }
3906 }
3907 *w = 0;
3908#else
3909 /* sizeof(wchar_t) == 4 */
3910 Py_FatalError("Impossible unicode object state, wstr and str "
3911 "should share memory already.");
3912 return NULL;
3913#endif
3914 }
3915 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003916 if ((size_t)_PyUnicode_LENGTH(unicode) >
3917 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3918 PyErr_NoMemory();
3919 return NULL;
3920 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3922 (_PyUnicode_LENGTH(unicode) + 1));
3923 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924 PyErr_NoMemory();
3925 return NULL;
3926 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003927 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3928 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3929 w = _PyUnicode_WSTR(unicode);
3930 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003932 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3933 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 for (; w < wchar_end; ++one_byte, ++w)
3935 *w = *one_byte;
3936 /* null-terminate the wstr */
3937 *w = 0;
3938 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 for (; w < wchar_end; ++two_bytes, ++w)
3943 *w = *two_bytes;
3944 /* null-terminate the wstr */
3945 *w = 0;
3946#else
3947 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003948 PyObject_FREE(_PyUnicode_WSTR(unicode));
3949 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 Py_FatalError("Impossible unicode object state, wstr "
3951 "and str should share memory already.");
3952 return NULL;
3953#endif
3954 }
3955 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07003956 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 }
3958 }
3959 }
3960 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 *size = PyUnicode_WSTR_LENGTH(unicode);
3962 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003963}
3964
Alexander Belopolsky40018472011-02-26 01:02:56 +00003965Py_UNICODE *
3966PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969}
3970
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003971const Py_UNICODE *
3972_PyUnicode_AsUnicode(PyObject *unicode)
3973{
3974 Py_ssize_t size;
3975 const Py_UNICODE *wstr;
3976
3977 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3978 if (wstr && wcslen(wstr) != (size_t)size) {
3979 PyErr_SetString(PyExc_ValueError, "embedded null character");
3980 return NULL;
3981 }
3982 return wstr;
3983}
3984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985
Alexander Belopolsky40018472011-02-26 01:02:56 +00003986Py_ssize_t
3987PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
3989 if (!PyUnicode_Check(unicode)) {
3990 PyErr_BadArgument();
3991 goto onError;
3992 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003993 if (_PyUnicode_WSTR(unicode) == NULL) {
3994 if (PyUnicode_AsUnicode(unicode) == NULL)
3995 goto onError;
3996 }
3997 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 return -1;
4001}
4002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003Py_ssize_t
4004PyUnicode_GetLength(PyObject *unicode)
4005{
Victor Stinner07621332012-06-16 04:53:46 +02004006 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 PyErr_BadArgument();
4008 return -1;
4009 }
Victor Stinner07621332012-06-16 04:53:46 +02004010 if (PyUnicode_READY(unicode) == -1)
4011 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return PyUnicode_GET_LENGTH(unicode);
4013}
4014
4015Py_UCS4
4016PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4017{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004018 void *data;
4019 int kind;
4020
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004021 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004022 PyErr_BadArgument();
4023 return (Py_UCS4)-1;
4024 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004025 if (PyUnicode_READY(unicode) == -1) {
4026 return (Py_UCS4)-1;
4027 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004028 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004029 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 return (Py_UCS4)-1;
4031 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004032 data = PyUnicode_DATA(unicode);
4033 kind = PyUnicode_KIND(unicode);
4034 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035}
4036
4037int
4038PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4039{
4040 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004041 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 return -1;
4043 }
Victor Stinner488fa492011-12-12 00:01:39 +01004044 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004045 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004046 PyErr_SetString(PyExc_IndexError, "string index out of range");
4047 return -1;
4048 }
Victor Stinner488fa492011-12-12 00:01:39 +01004049 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004050 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004051 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4052 PyErr_SetString(PyExc_ValueError, "character out of range");
4053 return -1;
4054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4056 index, ch);
4057 return 0;
4058}
4059
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4065
Victor Stinner554f3f02010-06-16 23:33:54 +00004066/* create or adjust a UnicodeDecodeError */
4067static void
4068make_decode_exception(PyObject **exceptionObject,
4069 const char *encoding,
4070 const char *input, Py_ssize_t length,
4071 Py_ssize_t startpos, Py_ssize_t endpos,
4072 const char *reason)
4073{
4074 if (*exceptionObject == NULL) {
4075 *exceptionObject = PyUnicodeDecodeError_Create(
4076 encoding, input, length, startpos, endpos, reason);
4077 }
4078 else {
4079 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4080 goto onError;
4081 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4082 goto onError;
4083 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4084 goto onError;
4085 }
4086 return;
4087
4088onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004089 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004090}
4091
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders that write into the wstr
   buffer of *output:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler: name of the error handler and a cache for the
       looked-up callable; the lookup only happens while *errorHandler is
       NULL.
   encoding, reason: used to create or update *exceptionObject.
   input, inend: start and end of the input bytes; rewritten from the
       exception's object after the callback ran, since the callback may
       have replaced it.
   startinpos, endinpos: range of the offending input; *endinpos is set to
       the position returned by the callback.
   inptr: set to *input plus the position returned by the callback.
   output, outpos: output object and write position; the output is resized
       when the replacement plus the remaining input would not fit.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the "Un;" format prefix doubles as the TypeError
       message for non-tuple results (see &argparse[3] below). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is safe and sufficient */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  A length-based memcpy() is
       required here: wcsncpy() stops at the first NUL wide character and
       zero-fills the remainder, which would silently truncate a replacement
       string that contains an embedded NUL. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204
4205static int
4206unicode_decode_call_errorhandler_writer(
4207 const char *errors, PyObject **errorHandler,
4208 const char *encoding, const char *reason,
4209 const char **input, const char **inend, Py_ssize_t *startinpos,
4210 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4211 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4212{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004213 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004214
4215 PyObject *restuple = NULL;
4216 PyObject *repunicode = NULL;
4217 Py_ssize_t insize;
4218 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004219 Py_ssize_t replen;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004220 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004221 PyObject *inputobj = NULL;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004222 int need_to_grow = 0;
4223 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004224
4225 if (*errorHandler == NULL) {
4226 *errorHandler = PyCodec_LookupError(errors);
4227 if (*errorHandler == NULL)
4228 goto onError;
4229 }
4230
4231 make_decode_exception(exceptionObject,
4232 encoding,
4233 *input, *inend - *input,
4234 *startinpos, *endinpos,
4235 reason);
4236 if (*exceptionObject == NULL)
4237 goto onError;
4238
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004239 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004240 if (restuple == NULL)
4241 goto onError;
4242 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004243 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004244 goto onError;
4245 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004246 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004247 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004248
4249 /* Copy back the bytes variables, which might have been modified by the
4250 callback */
4251 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4252 if (!inputobj)
4253 goto onError;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004254 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004255 *input = PyBytes_AS_STRING(inputobj);
4256 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004257 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004258 /* we can DECREF safely, as the exception has another reference,
4259 so the object won't go away. */
4260 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004264 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004265 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268
Victor Stinner170ca6f2013-04-18 00:25:28 +02004269 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004270 if (replen > 1) {
4271 writer->min_length += replen - 1;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004272 need_to_grow = 1;
4273 }
4274 new_inptr = *input + newpos;
4275 if (*inend - new_inptr > remain) {
4276 /* We don't know the decoding algorithm here so we make the worst
4277 assumption that one byte decodes to one unicode character.
4278 If unfortunately one byte could decode to more unicode characters,
4279 the decoder may write out-of-bound then. Is it possible for the
4280 algorithms using this function? */
4281 writer->min_length += *inend - new_inptr - remain;
4282 need_to_grow = 1;
4283 }
4284 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004285 writer->overallocate = 1;
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08004286 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004287 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4288 goto onError;
4289 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004291 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 *endinpos = newpos;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004294 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004295
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004297 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303}
4304
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004305/* --- UTF-7 Codec -------------------------------------------------------- */
4306
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307/* See RFC2152 for details. We encode conservatively and decode liberally. */
4308
4309/* Three simple macros defining base-64. */
4310
/* True (1) when c is one of the 64 base-64 alphabet characters:
   'A'-'Z', 'a'-'z', '0'-'9', '+' or '/'.  The argument is evaluated
   more than once, so it must not have side effects. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))
4318
/* Map a base-64 character to its 6-bit value (0-63).  Only meaningful
   when IS_BASE64(c) holds: any character that is not a letter, a digit
   or '+' yields 63, the value of '/'. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     63)
4326
/* Base-64 alphabet character for the low 6 bits of n; higher bits of n
   are masked off and ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4331
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4339
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table, indexed by ASCII code (0-127),
 * gives the class of each character:
 *    0 : "Set D"
 *        alphanumeric and '(),-./:?
 *    1 : "Set O"
 *        !"#$%&*;<=>@[]^_`{|}
 *    2 : "whitespace"
 *        ht nl cr sp
 *    3 : special (must be base64 encoded)
 *        everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4373
/* ENCODE_DIRECT: true when character c should be encoded as itself.
 * c must be in the range 1..127 and either belong to Set D (category 0),
 * or be whitespace (category 2) with directWS true, or belong to Set O
 * (category 1) with directO true.  RFC2152 makes it clear that whether
 * Set O and whitespace are written directly varies between applications,
 * so both are left to the caller.
 *
 * c is evaluated several times and must not have side effects.  The two
 * flag arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their intended grouping. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385
Alexander Belopolsky40018472011-02-26 01:02:56 +00004386PyObject *
4387PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004388 Py_ssize_t size,
4389 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004391 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4392}
4393
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   input bytes and their count; size == 0 returns the empty
 *             string immediately.
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler_writer().
 *   consumed  if non-NULL, receives the number of input bytes consumed.
 *             When the input ends inside a shift sequence this is the
 *             offset of the '+' that opened it, and the output is cut
 *             back to what had been written before that '+'.
 *             If NULL, ending inside a shift sequence in an inconsistent
 *             state is reported through the error handler instead.
 *
 * Returns the decoded object, or NULL after an error (the writer is
 * deallocated on that path). */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;      /* set whenever '+' or a bad byte is seen */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;            /* nonzero while inside a base-64 section */
    Py_ssize_t shiftOutStart;   /* writer.pos when the current section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* accumulated base-64 bits */
    Py_UCS4 surrogate = 0;      /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by `goto restart` from the end-of-string handling
           below when the error handler leaves input still to decode. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            /* valid pair: emit the combined code point */
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a pair: emit the pending high surrogate
                               on its own, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate: hold it until the next unit */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                /* In the error exits below startinpos is still the offset
                   recorded at the '+' that opened this section. */
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte is one that decodes directly;
                   otherwise it is dropped by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* s has already been advanced past the offending byte.  The handler
           receives the addresses of starts, e and s and may update them. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back before e: keep decoding */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report only the input before the unfinished '+' section */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Characters were written after the section began and the
                   writer holds non-ASCII data: build the result directly
                   from the first shiftOutStart characters rather than
                   through _PyUnicodeWriter_Finish(). */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4592
4593
Alexander Belopolsky40018472011-02-26 01:02:56 +00004594PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004595_PyUnicode_EncodeUTF7(PyObject *str,
4596 int base64SetO,
4597 int base64WhiteSpace,
4598 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004600 int kind;
4601 void *data;
4602 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004603 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004605 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 unsigned int base64bits = 0;
4607 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608 char * out;
4609 char * start;
4610
Benjamin Petersonbac79492012-01-14 13:34:47 -05004611 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004612 return NULL;
4613 kind = PyUnicode_KIND(str);
4614 data = PyUnicode_DATA(str);
4615 len = PyUnicode_GET_LENGTH(str);
4616
4617 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004620 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004621 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004622 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004623 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624 if (v == NULL)
4625 return NULL;
4626
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004627 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004628 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004629 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 if (inShift) {
4632 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4633 /* shifting out */
4634 if (base64bits) { /* output remaining bits */
4635 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4636 base64buffer = 0;
4637 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638 }
4639 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 /* Characters not in the BASE64 set implicitly unshift the sequence
4641 so no '-' is required, except if the character is itself a '-' */
4642 if (IS_BASE64(ch) || ch == '-') {
4643 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 *out++ = (char) ch;
4646 }
4647 else {
4648 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004651 else { /* not in a shift sequence */
4652 if (ch == '+') {
4653 *out++ = '+';
4654 *out++ = '-';
4655 }
4656 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4657 *out++ = (char) ch;
4658 }
4659 else {
4660 *out++ = '+';
4661 inShift = 1;
4662 goto encode_char;
4663 }
4664 }
4665 continue;
4666encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004668 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004669
Antoine Pitrou244651a2009-05-04 18:56:13 +00004670 /* code first surrogate */
4671 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004672 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 while (base64bits >= 6) {
4674 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4675 base64bits -= 6;
4676 }
4677 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004678 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 base64bits += 16;
4681 base64buffer = (base64buffer << 16) | ch;
4682 while (base64bits >= 6) {
4683 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4684 base64bits -= 6;
4685 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004686 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 if (base64bits)
4688 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4689 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004690 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004691 if (_PyBytes_Resize(&v, out - start) < 0)
4692 return NULL;
4693 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004695PyObject *
4696PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4697 Py_ssize_t size,
4698 int base64SetO,
4699 int base64WhiteSpace,
4700 const char *errors)
4701{
4702 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004703 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004704 if (tmp == NULL)
4705 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004706 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004707 base64WhiteSpace, errors);
4708 Py_DECREF(tmp);
4709 return result;
4710}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712#undef IS_BASE64
4713#undef FROM_BASE64
4714#undef TO_BASE64
4715#undef DECODE_DIRECT
4716#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718/* --- UTF-8 Codec -------------------------------------------------------- */
4719
Alexander Belopolsky40018472011-02-26 01:02:56 +00004720PyObject *
4721PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004722 Py_ssize_t size,
4723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724{
Walter Dörwald69652032004-09-07 20:24:22 +00004725 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4726}
4727
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728#include "stringlib/asciilib.h"
4729#include "stringlib/codecs.h"
4730#include "stringlib/undef.h"
4731
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004732#include "stringlib/ucs1lib.h"
4733#include "stringlib/codecs.h"
4734#include "stringlib/undef.h"
4735
4736#include "stringlib/ucs2lib.h"
4737#include "stringlib/codecs.h"
4738#include "stringlib/undef.h"
4739
4740#include "stringlib/ucs4lib.h"
4741#include "stringlib/codecs.h"
4742#include "stringlib/undef.h"
4743
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of
   every byte set, so `value & ASCII_CHAR_MASK` is nonzero exactly when
   at least one byte of `value` is >= 0x80.  Only 4- and 8-byte longs
   are supported; any other size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4753
/* Copy the leading run of ASCII bytes (top bit clear) from [start, end)
   into dest and return the number of bytes copied.  Scanning stops at the
   first byte with the 0x80 bit set, or at end.

   Where possible the input is examined one C 'long' at a time using
   ASCII_CHAR_MASK, falling back to a byte-by-byte check for the
   unaligned head/tail and for locating the exact offending byte. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* end rounded down to a multiple of SIZEOF_LONG: the limit for
       whole-word reads */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both long-aligned here, so whole
           words can be checked and copied in one step. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* Finish byte by byte: either the tail after the last full word
           or the ASCII prefix of the word that failed the mask test. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* General path: only scan here (no stores), then copy the whole
       ASCII prefix with a single memcpy() at the end. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* reached the end exactly on a word boundary: nothing left
               to examine */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004817
Victor Stinner785938e2011-12-11 20:09:03 +01004818PyObject *
4819PyUnicode_DecodeUTF8Stateful(const char *s,
4820 Py_ssize_t size,
4821 const char *errors,
4822 Py_ssize_t *consumed)
4823{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004824 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004825 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004826 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827
4828 Py_ssize_t startinpos;
4829 Py_ssize_t endinpos;
4830 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004831 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004833 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004834
4835 if (size == 0) {
4836 if (consumed)
4837 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004838 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004839 }
4840
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004841 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4842 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004843 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004844 *consumed = 1;
4845 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004846 }
4847
Victor Stinner8f674cc2013-04-17 23:02:17 +02004848 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004849 writer.min_length = size;
4850 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004852
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004853 writer.pos = ascii_decode(s, end, writer.data);
4854 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004855 while (s < end) {
4856 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004858
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004860 if (PyUnicode_IS_ASCII(writer.buffer))
4861 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004863 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004865 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866 } else {
4867 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004868 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869 }
4870
4871 switch (ch) {
4872 case 0:
4873 if (s == end || consumed)
4874 goto End;
4875 errmsg = "unexpected end of data";
4876 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004877 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 break;
4879 case 1:
4880 errmsg = "invalid start byte";
4881 startinpos = s - starts;
4882 endinpos = startinpos + 1;
4883 break;
4884 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004885 case 3:
4886 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 errmsg = "invalid continuation byte";
4888 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004889 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 break;
4891 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004892 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004893 goto onError;
4894 continue;
4895 }
4896
Victor Stinner1d65d912015-10-05 13:43:50 +02004897 if (error_handler == _Py_ERROR_UNKNOWN)
4898 error_handler = get_error_handler(errors);
4899
4900 switch (error_handler) {
4901 case _Py_ERROR_IGNORE:
4902 s += (endinpos - startinpos);
4903 break;
4904
4905 case _Py_ERROR_REPLACE:
4906 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4907 goto onError;
4908 s += (endinpos - startinpos);
4909 break;
4910
4911 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004912 {
4913 Py_ssize_t i;
4914
Victor Stinner1d65d912015-10-05 13:43:50 +02004915 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4916 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004917 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004918 ch = (Py_UCS4)(unsigned char)(starts[i]);
4919 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4920 ch + 0xdc00);
4921 writer.pos++;
4922 }
4923 s += (endinpos - startinpos);
4924 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004925 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004926
4927 default:
4928 if (unicode_decode_call_errorhandler_writer(
4929 errors, &error_handler_obj,
4930 "utf-8", errmsg,
4931 &starts, &end, &startinpos, &endinpos, &exc, &s,
4932 &writer))
4933 goto onError;
4934 }
Victor Stinner785938e2011-12-11 20:09:03 +01004935 }
4936
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938 if (consumed)
4939 *consumed = s - starts;
4940
Victor Stinner1d65d912015-10-05 13:43:50 +02004941 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004943 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944
4945onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004946 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004948 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004950}
4951
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004952
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004953/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4954 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004955
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004956 On success, write a pointer to a newly allocated wide character string into
4957 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4958 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004959
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004960 On memory allocation failure, return -1.
4961
4962 On decoding error (if surrogateescape is zero), return -2. If wlen is
4963 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4964 is not NULL, write the decoding error message into *reason. */
4965int
4966_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4967 const char **reason, int surrogateescape)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004968{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004969 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004970 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 wchar_t *unicode;
4972 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004973
4974 /* Note: size will always be longer than the resulting Unicode
4975 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004976 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004977 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004978 }
4979
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004980 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004981 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004982 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004983 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984
4985 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004986 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004990#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004992#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 if (ch > 0xFF) {
4996#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004997 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004999 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005000 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5002 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5003#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 else {
5006 if (!ch && s == e)
5007 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005008 if (!surrogateescape) {
5009 PyMem_RawFree(unicode );
5010 if (reason != NULL) {
5011 switch (ch) {
5012 case 0:
5013 *reason = "unexpected end of data";
5014 break;
5015 case 1:
5016 *reason = "invalid start byte";
5017 break;
5018 /* 2, 3, 4 */
5019 default:
5020 *reason = "invalid continuation byte";
5021 break;
5022 }
5023 }
5024 if (wlen != NULL) {
5025 *wlen = s - orig_s;
5026 }
5027 return -2;
5028 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005029 /* surrogateescape */
5030 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5031 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005032 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005034 if (wlen) {
5035 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005036 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005037 *wstr = unicode;
5038 return 0;
5039}
5040
5041wchar_t*
5042_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5043{
5044 wchar_t *wstr;
5045 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5046 if (res != 0) {
5047 return NULL;
5048 }
5049 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005050}
5051
Antoine Pitrouab868312009-01-10 15:40:25 +00005052
Victor Stinnere47e6982017-12-21 15:45:16 +01005053/* UTF-8 encoder using the surrogateescape error handler .
5054
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005055 On success, return 0 and write the newly allocated character string (use
5056 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005057
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005058 On encoding failure, return -2 and write the position of the invalid
5059 surrogate character into *error_pos (if error_pos is set) and the decoding
5060 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005061
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005062 On memory allocation failure, return -1. */
5063int
5064_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5065 const char **reason, int raw_malloc, int surrogateescape)
Victor Stinnere47e6982017-12-21 15:45:16 +01005066{
5067 const Py_ssize_t max_char_size = 4;
5068 Py_ssize_t len = wcslen(text);
5069
5070 assert(len >= 0);
5071
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005072 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5073 return -1;
5074 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005075 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076 if (raw_malloc) {
5077 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005078 }
5079 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005080 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005081 }
5082 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005083 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005084 }
5085
5086 char *p = bytes;
5087 Py_ssize_t i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088 for (i = 0; i < len; i++) {
5089 Py_UCS4 ch = text[i];
Victor Stinnere47e6982017-12-21 15:45:16 +01005090
5091 if (ch < 0x80) {
5092 /* Encode ASCII */
5093 *p++ = (char) ch;
5094
5095 }
5096 else if (ch < 0x0800) {
5097 /* Encode Latin-1 */
5098 *p++ = (char)(0xc0 | (ch >> 6));
5099 *p++ = (char)(0x80 | (ch & 0x3f));
5100 }
5101 else if (Py_UNICODE_IS_SURROGATE(ch)) {
5102 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005103 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005104 if (error_pos != NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005105 *error_pos = (size_t)i;
Victor Stinnere47e6982017-12-21 15:45:16 +01005106 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005107 if (reason != NULL) {
5108 *reason = "encoding error";
5109 }
5110 if (raw_malloc) {
5111 PyMem_RawFree(bytes);
5112 }
5113 else {
5114 PyMem_Free(bytes);
5115 }
5116 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005117 }
5118 *p++ = (char)(ch & 0xff);
5119 }
5120 else if (ch < 0x10000) {
5121 *p++ = (char)(0xe0 | (ch >> 12));
5122 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5123 *p++ = (char)(0x80 | (ch & 0x3f));
5124 }
5125 else { /* ch >= 0x10000 */
5126 assert(ch <= MAX_UNICODE);
5127 /* Encode UCS4 Unicode ordinals */
5128 *p++ = (char)(0xf0 | (ch >> 18));
5129 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5130 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5131 *p++ = (char)(0x80 | (ch & 0x3f));
5132 }
5133 }
5134 *p++ = '\0';
5135
5136 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005137 char *bytes2;
5138 if (raw_malloc) {
5139 bytes2 = PyMem_RawRealloc(bytes, final_size);
5140 }
5141 else {
5142 bytes2 = PyMem_Realloc(bytes, final_size);
5143 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005144 if (bytes2 == NULL) {
5145 if (error_pos != NULL) {
5146 *error_pos = (size_t)-1;
5147 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005148 if (raw_malloc) {
5149 PyMem_RawFree(bytes);
5150 }
5151 else {
5152 PyMem_Free(bytes);
5153 }
5154 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005155 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 *str = bytes2;
5157 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005158}
5159
5160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161/* Primary internal function which creates utf8 encoded bytes objects.
5162
5163 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005164 and allocate exactly as much space needed at the end. Else allocate the
5165 maximum possible needed (4 result bytes per Unicode character), and return
5166 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005167*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005168PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005169_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Victor Stinner6099a032011-12-18 14:22:26 +01005171 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 void *data;
5173 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175 if (!PyUnicode_Check(unicode)) {
5176 PyErr_BadArgument();
5177 return NULL;
5178 }
5179
5180 if (PyUnicode_READY(unicode) == -1)
5181 return NULL;
5182
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005183 if (PyUnicode_UTF8(unicode))
5184 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5185 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186
5187 kind = PyUnicode_KIND(unicode);
5188 data = PyUnicode_DATA(unicode);
5189 size = PyUnicode_GET_LENGTH(unicode);
5190
Benjamin Petersonead6b532011-12-20 17:23:42 -06005191 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005192 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005193 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005194 case PyUnicode_1BYTE_KIND:
5195 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5196 assert(!PyUnicode_IS_ASCII(unicode));
5197 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5198 case PyUnicode_2BYTE_KIND:
5199 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5200 case PyUnicode_4BYTE_KIND:
5201 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203}
5204
Alexander Belopolsky40018472011-02-26 01:02:56 +00005205PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005206PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5207 Py_ssize_t size,
5208 const char *errors)
5209{
5210 PyObject *v, *unicode;
5211
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005212 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 if (unicode == NULL)
5214 return NULL;
5215 v = _PyUnicode_AsUTF8String(unicode, errors);
5216 Py_DECREF(unicode);
5217 return v;
5218}
5219
5220PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005223 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224}
5225
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226/* --- UTF-32 Codec ------------------------------------------------------- */
5227
5228PyObject *
5229PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 Py_ssize_t size,
5231 const char *errors,
5232 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233{
5234 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5235}
5236
5237PyObject *
5238PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 Py_ssize_t size,
5240 const char *errors,
5241 int *byteorder,
5242 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243{
5244 const char *starts = s;
5245 Py_ssize_t startinpos;
5246 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005247 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005248 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005249 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005250 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 PyObject *errorHandler = NULL;
5253 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005254
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255 q = (unsigned char *)s;
5256 e = q + size;
5257
5258 if (byteorder)
5259 bo = *byteorder;
5260
5261 /* Check for BOM marks (U+FEFF) in the input and adjust current
5262 byte order setting accordingly. In native mode, the leading BOM
5263 mark is skipped, in all other modes, it is copied to the output
5264 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005265 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005266 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005267 if (bom == 0x0000FEFF) {
5268 bo = -1;
5269 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005271 else if (bom == 0xFFFE0000) {
5272 bo = 1;
5273 q += 4;
5274 }
5275 if (byteorder)
5276 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005277 }
5278
Victor Stinnere64322e2012-10-30 23:12:47 +01005279 if (q == e) {
5280 if (consumed)
5281 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005282 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005283 }
5284
Victor Stinnere64322e2012-10-30 23:12:47 +01005285#ifdef WORDS_BIGENDIAN
5286 le = bo < 0;
5287#else
5288 le = bo <= 0;
5289#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005290 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005291
Victor Stinner8f674cc2013-04-17 23:02:17 +02005292 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005293 writer.min_length = (e - q + 3) / 4;
5294 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005296
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 while (1) {
5298 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005300
Victor Stinnere64322e2012-10-30 23:12:47 +01005301 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 enum PyUnicode_Kind kind = writer.kind;
5303 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005304 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 if (le) {
5307 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005308 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005309 if (ch > maxch)
5310 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005311 if (kind != PyUnicode_1BYTE_KIND &&
5312 Py_UNICODE_IS_SURROGATE(ch))
5313 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005315 q += 4;
5316 } while (q <= last);
5317 }
5318 else {
5319 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005320 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005321 if (ch > maxch)
5322 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005323 if (kind != PyUnicode_1BYTE_KIND &&
5324 Py_UNICODE_IS_SURROGATE(ch))
5325 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005326 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005327 q += 4;
5328 } while (q <= last);
5329 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005331 }
5332
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005334 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005335 startinpos = ((const char *)q) - starts;
5336 endinpos = startinpos + 4;
5337 }
5338 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 startinpos = ((const char *)q) - starts;
5344 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005346 else {
5347 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005348 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005349 goto onError;
5350 q += 4;
5351 continue;
5352 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005353 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005354 startinpos = ((const char *)q) - starts;
5355 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005357
5358 /* The remaining input chars are ignored if the callback
5359 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005362 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005366 }
5367
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 Py_XDECREF(errorHandler);
5372 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005376 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc);
5379 return NULL;
5380}
5381
5382PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383_PyUnicode_EncodeUTF32(PyObject *str,
5384 const char *errors,
5385 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005386{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005387 enum PyUnicode_Kind kind;
5388 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005390 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005391 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005392#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005393 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005397 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005398 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005399 PyObject *errorHandler = NULL;
5400 PyObject *exc = NULL;
5401 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005402
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403 if (!PyUnicode_Check(str)) {
5404 PyErr_BadArgument();
5405 return NULL;
5406 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005407 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005408 return NULL;
5409 kind = PyUnicode_KIND(str);
5410 data = PyUnicode_DATA(str);
5411 len = PyUnicode_GET_LENGTH(str);
5412
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005413 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005414 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005415 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005416 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417 if (v == NULL)
5418 return NULL;
5419
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005420 /* output buffer is 4-bytes aligned */
5421 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005422 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005423 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005424 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005429 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005432 else
5433 encoding = "utf-32";
5434
5435 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5437 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005438 }
5439
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005440 pos = 0;
5441 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005442 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005443
5444 if (kind == PyUnicode_2BYTE_KIND) {
5445 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5446 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005447 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005448 else {
5449 assert(kind == PyUnicode_4BYTE_KIND);
5450 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5451 &out, native_ordering);
5452 }
5453 if (pos == len)
5454 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005455
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005456 rep = unicode_encode_call_errorhandler(
5457 errors, &errorHandler,
5458 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005459 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005460 if (!rep)
5461 goto error;
5462
5463 if (PyBytes_Check(rep)) {
5464 repsize = PyBytes_GET_SIZE(rep);
5465 if (repsize & 3) {
5466 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005467 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005468 "surrogates not allowed");
5469 goto error;
5470 }
5471 moreunits = repsize / 4;
5472 }
5473 else {
5474 assert(PyUnicode_Check(rep));
5475 if (PyUnicode_READY(rep) < 0)
5476 goto error;
5477 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5478 if (!PyUnicode_IS_ASCII(rep)) {
5479 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005480 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 "surrogates not allowed");
5482 goto error;
5483 }
5484 }
5485
5486 /* four bytes are reserved for each surrogate */
5487 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005488 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005489 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005490 /* integer overflow */
5491 PyErr_NoMemory();
5492 goto error;
5493 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005494 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005496 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 }
5498
5499 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005500 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005501 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005502 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005504 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5505 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 }
5507
5508 Py_CLEAR(rep);
5509 }
5510
5511 /* Cut back to size actually needed. This is necessary for, for example,
5512 encoding of a string containing isolated surrogates and the 'ignore'
5513 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005514 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 if (nsize != PyBytes_GET_SIZE(v))
5516 _PyBytes_Resize(&v, nsize);
5517 Py_XDECREF(errorHandler);
5518 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005519 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005520 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 error:
5522 Py_XDECREF(rep);
5523 Py_XDECREF(errorHandler);
5524 Py_XDECREF(exc);
5525 Py_XDECREF(v);
5526 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005527}
5528
Alexander Belopolsky40018472011-02-26 01:02:56 +00005529PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5531 Py_ssize_t size,
5532 const char *errors,
5533 int byteorder)
5534{
5535 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005536 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 if (tmp == NULL)
5538 return NULL;
5539 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5540 Py_DECREF(tmp);
5541 return result;
5542}
5543
5544PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005545PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546{
Victor Stinnerb960b342011-11-20 19:12:52 +01005547 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548}
5549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550/* --- UTF-16 Codec ------------------------------------------------------- */
5551
Tim Peters772747b2001-08-09 22:21:55 +00005552PyObject *
5553PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 Py_ssize_t size,
5555 const char *errors,
5556 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557{
Walter Dörwald69652032004-09-07 20:24:22 +00005558 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5559}
5560
5561PyObject *
5562PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 Py_ssize_t size,
5564 const char *errors,
5565 int *byteorder,
5566 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005567{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005569 Py_ssize_t startinpos;
5570 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005572 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005573 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005575 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 PyObject *errorHandler = NULL;
5577 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005578 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Tim Peters772747b2001-08-09 22:21:55 +00005580 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005581 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005584 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005586 /* Check for BOM marks (U+FEFF) in the input and adjust current
5587 byte order setting accordingly. In native mode, the leading BOM
5588 mark is skipped, in all other modes, it is copied to the output
5589 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005590 if (bo == 0 && size >= 2) {
5591 const Py_UCS4 bom = (q[1] << 8) | q[0];
5592 if (bom == 0xFEFF) {
5593 q += 2;
5594 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005596 else if (bom == 0xFFFE) {
5597 q += 2;
5598 bo = 1;
5599 }
5600 if (byteorder)
5601 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 if (q == e) {
5605 if (consumed)
5606 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005607 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005608 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005609
Christian Heimes743e0cd2012-10-17 23:52:17 +02005610#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005613#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005614 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005615 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005616#endif
Tim Peters772747b2001-08-09 22:21:55 +00005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang86fdad02018-01-31 20:48:05 +08005619 character count normally. Error handler will take care of
5620 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005621 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005622 writer.min_length = (e - q + 1) / 2;
5623 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005625
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 while (1) {
5627 Py_UCS4 ch = 0;
5628 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 else
5636 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005638 native_ordering);
5639 } else if (kind == PyUnicode_2BYTE_KIND) {
5640 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005642 native_ordering);
5643 } else {
5644 assert(kind == PyUnicode_4BYTE_KIND);
5645 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005646 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005647 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005648 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005649 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650
Antoine Pitrou63065d72012-05-15 23:48:04 +02005651 switch (ch)
5652 {
5653 case 0:
5654 /* remaining byte at the end? (size should be even) */
5655 if (q == e || consumed)
5656 goto End;
5657 errmsg = "truncated data";
5658 startinpos = ((const char *)q) - starts;
5659 endinpos = ((const char *)e) - starts;
5660 break;
5661 /* The remaining input chars are ignored if the callback
5662 chooses to skip the input */
5663 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005664 q -= 2;
5665 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005666 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005667 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005668 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005669 endinpos = ((const char *)e) - starts;
5670 break;
5671 case 2:
5672 errmsg = "illegal encoding";
5673 startinpos = ((const char *)q) - 2 - starts;
5674 endinpos = startinpos + 2;
5675 break;
5676 case 3:
5677 errmsg = "illegal UTF-16 surrogate";
5678 startinpos = ((const char *)q) - 4 - starts;
5679 endinpos = startinpos + 2;
5680 break;
5681 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005682 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 continue;
5685 }
5686
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005688 errors,
5689 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005690 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005691 &starts,
5692 (const char **)&e,
5693 &startinpos,
5694 &endinpos,
5695 &exc,
5696 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 }
5700
Antoine Pitrou63065d72012-05-15 23:48:04 +02005701End:
Walter Dörwald69652032004-09-07 20:24:22 +00005702 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 Py_XDECREF(errorHandler);
5706 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 Py_XDECREF(errorHandler);
5712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 return NULL;
5714}
5715
Tim Peters772747b2001-08-09 22:21:55 +00005716PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005717_PyUnicode_EncodeUTF16(PyObject *str,
5718 const char *errors,
5719 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005721 enum PyUnicode_Kind kind;
5722 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005723 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005724 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005725 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005726 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005727#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005728 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005729#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005731#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 const char *encoding;
5733 Py_ssize_t nsize, pos;
5734 PyObject *errorHandler = NULL;
5735 PyObject *exc = NULL;
5736 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005737
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738 if (!PyUnicode_Check(str)) {
5739 PyErr_BadArgument();
5740 return NULL;
5741 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005742 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005743 return NULL;
5744 kind = PyUnicode_KIND(str);
5745 data = PyUnicode_DATA(str);
5746 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005747
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 if (kind == PyUnicode_4BYTE_KIND) {
5750 const Py_UCS4 *in = (const Py_UCS4 *)data;
5751 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 while (in < end) {
5753 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 }
5756 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005757 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005758 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 nsize = len + pairs + (byteorder == 0);
5762 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005763 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005767 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005768 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005769 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005770 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005771 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 }
5773 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005774 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005775 }
Tim Peters772747b2001-08-09 22:21:55 +00005776
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 if (kind == PyUnicode_1BYTE_KIND) {
5778 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5779 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005780 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005781
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005784 }
5785 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005787 }
5788 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005789 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005790 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005791
5792 pos = 0;
5793 while (pos < len) {
5794 Py_ssize_t repsize, moreunits;
5795
5796 if (kind == PyUnicode_2BYTE_KIND) {
5797 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5798 &out, native_ordering);
5799 }
5800 else {
5801 assert(kind == PyUnicode_4BYTE_KIND);
5802 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5803 &out, native_ordering);
5804 }
5805 if (pos == len)
5806 break;
5807
5808 rep = unicode_encode_call_errorhandler(
5809 errors, &errorHandler,
5810 encoding, "surrogates not allowed",
5811 str, &exc, pos, pos + 1, &pos);
5812 if (!rep)
5813 goto error;
5814
5815 if (PyBytes_Check(rep)) {
5816 repsize = PyBytes_GET_SIZE(rep);
5817 if (repsize & 1) {
5818 raise_encode_exception(&exc, encoding,
5819 str, pos - 1, pos,
5820 "surrogates not allowed");
5821 goto error;
5822 }
5823 moreunits = repsize / 2;
5824 }
5825 else {
5826 assert(PyUnicode_Check(rep));
5827 if (PyUnicode_READY(rep) < 0)
5828 goto error;
5829 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5830 if (!PyUnicode_IS_ASCII(rep)) {
5831 raise_encode_exception(&exc, encoding,
5832 str, pos - 1, pos,
5833 "surrogates not allowed");
5834 goto error;
5835 }
5836 }
5837
5838 /* two bytes are reserved for each surrogate */
5839 if (moreunits > 1) {
5840 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005841 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 /* integer overflow */
5843 PyErr_NoMemory();
5844 goto error;
5845 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005846 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005847 goto error;
5848 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5849 }
5850
5851 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005852 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005853 out += moreunits;
5854 } else /* rep is unicode */ {
5855 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5856 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5857 &out, native_ordering);
5858 }
5859
5860 Py_CLEAR(rep);
5861 }
5862
5863 /* Cut back to size actually needed. This is necessary for, for example,
5864 encoding of a string containing isolated surrogates and the 'ignore' handler
5865 is used. */
5866 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5867 if (nsize != PyBytes_GET_SIZE(v))
5868 _PyBytes_Resize(&v, nsize);
5869 Py_XDECREF(errorHandler);
5870 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005871 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005873 error:
5874 Py_XDECREF(rep);
5875 Py_XDECREF(errorHandler);
5876 Py_XDECREF(exc);
5877 Py_XDECREF(v);
5878 return NULL;
5879#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880}
5881
Alexander Belopolsky40018472011-02-26 01:02:56 +00005882PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5884 Py_ssize_t size,
5885 const char *errors,
5886 int byteorder)
5887{
5888 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005889 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 if (tmp == NULL)
5891 return NULL;
5892 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5893 Py_DECREF(tmp);
5894 return result;
5895}
5896
5897PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005898PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901}
5902
5903/* --- Unicode Escape Codec ----------------------------------------------- */
5904
Fredrik Lundh06d12682001-01-24 07:59:11 +00005905static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005906
Alexander Belopolsky40018472011-02-26 01:02:56 +00005907PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005908_PyUnicode_DecodeUnicodeEscape(const char *s,
5909 Py_ssize_t size,
5910 const char *errors,
5911 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005914 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 PyObject *errorHandler = NULL;
5917 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005918
Eric V. Smith42454af2016-10-31 09:22:08 -04005919 // so we can remember if we've seen an invalid escape char or not
5920 *first_invalid_escape = NULL;
5921
Victor Stinner62ec3312016-09-06 17:04:34 -07005922 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005924 }
5925 /* Escaped strings will always be longer than the resulting
5926 Unicode string, so we start with size here and then reduce the
5927 length after conversion to the true value.
5928 (but if the error callback returns a long replacement string
5929 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005930 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005931 writer.min_length = size;
5932 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5933 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934 }
5935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 end = s + size;
5937 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005938 unsigned char c = (unsigned char) *s++;
5939 Py_UCS4 ch;
5940 int count;
5941 Py_ssize_t startinpos;
5942 Py_ssize_t endinpos;
5943 const char *message;
5944
5945#define WRITE_ASCII_CHAR(ch) \
5946 do { \
5947 assert(ch <= 127); \
5948 assert(writer.pos < writer.size); \
5949 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5950 } while(0)
5951
5952#define WRITE_CHAR(ch) \
5953 do { \
5954 if (ch <= writer.maxchar) { \
5955 assert(writer.pos < writer.size); \
5956 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5957 } \
5958 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5959 goto onError; \
5960 } \
5961 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
5963 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005964 if (c != '\\') {
5965 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 continue;
5967 }
5968
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 if (s >= end) {
5972 message = "\\ at end of string";
5973 goto error;
5974 }
5975 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005976
Victor Stinner62ec3312016-09-06 17:04:34 -07005977 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005978 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005981 case '\n': continue;
5982 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5983 case '\'': WRITE_ASCII_CHAR('\''); continue;
5984 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5985 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005987 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5988 case 't': WRITE_ASCII_CHAR('\t'); continue;
5989 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5990 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005991 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005992 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005993 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 case '0': case '1': case '2': case '3':
5998 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005999 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006000 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 ch = (ch<<3) + *s++ - '0';
6002 if (s < end && '0' <= *s && *s <= '7') {
6003 ch = (ch<<3) + *s++ - '0';
6004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006006 WRITE_CHAR(ch);
6007 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 /* hex escapes */
6010 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006012 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006013 message = "truncated \\xXX escape";
6014 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006018 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006019 message = "truncated \\uXXXX escape";
6020 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006023 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006024 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006025 message = "truncated \\UXXXXXXXX escape";
6026 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006027 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006028 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006029 ch <<= 4;
6030 if (c >= '0' && c <= '9') {
6031 ch += c - '0';
6032 }
6033 else if (c >= 'a' && c <= 'f') {
6034 ch += c - ('a' - 10);
6035 }
6036 else if (c >= 'A' && c <= 'F') {
6037 ch += c - ('A' - 10);
6038 }
6039 else {
6040 break;
6041 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006042 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006043 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006044 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 }
6046
6047 /* when we get here, ch is a 32-bit unicode character */
6048 if (ch > MAX_UNICODE) {
6049 message = "illegal Unicode character";
6050 goto error;
6051 }
6052
6053 WRITE_CHAR(ch);
6054 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 if (ucnhash_CAPI == NULL) {
6059 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6061 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006062 if (ucnhash_CAPI == NULL) {
6063 PyErr_SetString(
6064 PyExc_UnicodeError,
6065 "\\N escapes not supported (can't load unicodedata module)"
6066 );
6067 goto onError;
6068 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006070
6071 message = "malformed \\N character escape";
Miss Islington (bot)9fbcb142018-11-13 16:39:36 -08006072 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006073 const char *start = ++s;
6074 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006075 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 namelen = s - start;
6079 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006080 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 ch = 0xffffffff; /* in case 'getcode' messes up */
6083 if (namelen <= INT_MAX &&
6084 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6085 &ch, 0)) {
6086 assert(ch <= MAX_UNICODE);
6087 WRITE_CHAR(ch);
6088 continue;
6089 }
6090 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006091 }
6092 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006093 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094
6095 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006096 if (*first_invalid_escape == NULL) {
6097 *first_invalid_escape = s-1; /* Back up one char, since we've
6098 already incremented s. */
6099 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006100 WRITE_ASCII_CHAR('\\');
6101 WRITE_CHAR(c);
6102 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006104
6105 error:
6106 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006107 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006108 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006109 errors, &errorHandler,
6110 "unicodeescape", message,
6111 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006112 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006113 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006114 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006115 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006116
6117#undef WRITE_ASCII_CHAR
6118#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006120
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006123 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006124
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006126 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 Py_XDECREF(errorHandler);
6128 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 return NULL;
6130}
6131
Eric V. Smith42454af2016-10-31 09:22:08 -04006132PyObject *
6133PyUnicode_DecodeUnicodeEscape(const char *s,
6134 Py_ssize_t size,
6135 const char *errors)
6136{
6137 const char *first_invalid_escape;
6138 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6139 &first_invalid_escape);
6140 if (result == NULL)
6141 return NULL;
6142 if (first_invalid_escape != NULL) {
6143 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6144 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006145 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006146 Py_DECREF(result);
6147 return NULL;
6148 }
6149 }
6150 return result;
6151}
6152
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006153/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154
Alexander Belopolsky40018472011-02-26 01:02:56 +00006155PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006159 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
Ezio Melottie7f90372012-10-05 03:33:31 +03006165 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006166 escape.
6167
Ezio Melottie7f90372012-10-05 03:33:31 +03006168 For UCS1 strings it's '\xxx', 4 bytes per source character.
6169 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6170 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006171 */
6172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 if (!PyUnicode_Check(unicode)) {
6174 PyErr_BadArgument();
6175 return NULL;
6176 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 }
Victor Stinner358af132015-10-12 22:36:57 +02006180
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 if (len == 0) {
6183 return PyBytes_FromStringAndSize(NULL, 0);
6184 }
6185
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 kind = PyUnicode_KIND(unicode);
6187 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6189 bytes, and 1 byte characters 4. */
6190 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006191 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 return PyErr_NoMemory();
6193 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006194 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 if (repr == NULL) {
6196 return NULL;
6197 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006200 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006201 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006202
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 /* U+0000-U+00ff range */
6204 if (ch < 0x100) {
6205 if (ch >= ' ' && ch < 127) {
6206 if (ch != '\\') {
6207 /* Copy printable US ASCII as-is */
6208 *p++ = (char) ch;
6209 }
6210 /* Escape backslashes */
6211 else {
6212 *p++ = '\\';
6213 *p++ = '\\';
6214 }
6215 }
Victor Stinner358af132015-10-12 22:36:57 +02006216
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 /* Map special whitespace to '\t', \n', '\r' */
6218 else if (ch == '\t') {
6219 *p++ = '\\';
6220 *p++ = 't';
6221 }
6222 else if (ch == '\n') {
6223 *p++ = '\\';
6224 *p++ = 'n';
6225 }
6226 else if (ch == '\r') {
6227 *p++ = '\\';
6228 *p++ = 'r';
6229 }
6230
6231 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6232 else {
6233 *p++ = '\\';
6234 *p++ = 'x';
6235 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6236 *p++ = Py_hexdigits[ch & 0x000F];
6237 }
Tim Petersced69f82003-09-16 20:30:58 +00006238 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006239 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006240 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 *p++ = '\\';
6242 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006243 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6246 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6249 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006250
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 /* Make sure that the first two digits are zero */
6252 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006253 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 *p++ = 'U';
6255 *p++ = '0';
6256 *p++ = '0';
6257 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6259 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6260 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6261 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6262 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 assert(p - PyBytes_AS_STRING(repr) > 0);
6267 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6268 return NULL;
6269 }
6270 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271}
6272
Alexander Belopolsky40018472011-02-26 01:02:56 +00006273PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006274PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006278 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006279 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006281 }
6282
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006283 result = PyUnicode_AsUnicodeEscapeString(tmp);
6284 Py_DECREF(tmp);
6285 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286}
6287
6288/* --- Raw Unicode Escape Codec ------------------------------------------- */
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290PyObject *
6291PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 Py_ssize_t size,
6293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006296 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 PyObject *errorHandler = NULL;
6299 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006300
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006302 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 /* Escaped strings will always be longer than the resulting
6306 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 length after conversion to the true value. (But decoding error
6308 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006309 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 writer.min_length = size;
6311 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6312 goto onError;
6313 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006314
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 end = s + size;
6316 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 unsigned char c = (unsigned char) *s++;
6318 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006319 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 Py_ssize_t startinpos;
6321 Py_ssize_t endinpos;
6322 const char *message;
6323
6324#define WRITE_CHAR(ch) \
6325 do { \
6326 if (ch <= writer.maxchar) { \
6327 assert(writer.pos < writer.size); \
6328 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6329 } \
6330 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6331 goto onError; \
6332 } \
6333 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 if (c != '\\' || s >= end) {
6337 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006340
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 c = (unsigned char) *s++;
6342 if (c == 'u') {
6343 count = 4;
6344 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 else if (c == 'U') {
6347 count = 8;
6348 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006349 }
6350 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 assert(writer.pos < writer.size);
6352 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6353 WRITE_CHAR(c);
6354 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006355 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 startinpos = s - starts - 2;
6357
6358 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6359 for (ch = 0; count && s < end; ++s, --count) {
6360 c = (unsigned char)*s;
6361 ch <<= 4;
6362 if (c >= '0' && c <= '9') {
6363 ch += c - '0';
6364 }
6365 else if (c >= 'a' && c <= 'f') {
6366 ch += c - ('a' - 10);
6367 }
6368 else if (c >= 'A' && c <= 'F') {
6369 ch += c - ('A' - 10);
6370 }
6371 else {
6372 break;
6373 }
6374 }
6375 if (!count) {
6376 if (ch <= MAX_UNICODE) {
6377 WRITE_CHAR(ch);
6378 continue;
6379 }
6380 message = "\\Uxxxxxxxx out of range";
6381 }
6382
6383 endinpos = s-starts;
6384 writer.min_length = end - s + writer.pos;
6385 if (unicode_decode_call_errorhandler_writer(
6386 errors, &errorHandler,
6387 "rawunicodeescape", message,
6388 &starts, &end, &startinpos, &endinpos, &exc, &s,
6389 &writer)) {
6390 goto onError;
6391 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006392 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006393
6394#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(errorHandler);
6397 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006399
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006401 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 Py_XDECREF(errorHandler);
6403 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 int kind;
6416 void *data;
6417 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 if (!PyUnicode_Check(unicode)) {
6420 PyErr_BadArgument();
6421 return NULL;
6422 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426 kind = PyUnicode_KIND(unicode);
6427 data = PyUnicode_DATA(unicode);
6428 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 if (kind == PyUnicode_1BYTE_KIND) {
6430 return PyBytes_FromStringAndSize(data, len);
6431 }
Victor Stinner0e368262011-11-10 20:12:49 +01006432
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6434 bytes, and 1 byte characters 4. */
6435 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 if (len > PY_SSIZE_T_MAX / expandsize) {
6438 return PyErr_NoMemory();
6439 }
6440 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6441 if (repr == NULL) {
6442 return NULL;
6443 }
6444 if (len == 0) {
6445 return repr;
6446 }
6447
6448 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 for (pos = 0; pos < len; pos++) {
6450 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006451
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6453 if (ch < 0x100) {
6454 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006455 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6457 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 *p++ = '\\';
6459 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6466 else {
6467 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6468 *p++ = '\\';
6469 *p++ = 'U';
6470 *p++ = '0';
6471 *p++ = '0';
6472 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6477 *p++ = Py_hexdigits[ch & 15];
6478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006480
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 assert(p > PyBytes_AS_STRING(repr));
6482 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6483 return NULL;
6484 }
6485 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486}
6487
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6490 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006493 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006495 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6497 Py_DECREF(tmp);
6498 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006501/* --- Unicode Internal Codec ------------------------------------------- */
6502
Alexander Belopolsky40018472011-02-26 01:02:56 +00006503PyObject *
6504_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006505 Py_ssize_t size,
6506 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006507{
6508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006509 Py_ssize_t startinpos;
6510 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006511 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006512 const char *end;
6513 const char *reason;
6514 PyObject *errorHandler = NULL;
6515 PyObject *exc = NULL;
6516
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006517 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006518 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006519 1))
6520 return NULL;
6521
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006522 if (size < 0) {
6523 PyErr_BadInternalCall();
6524 return NULL;
6525 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006526 if (size == 0)
6527 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006528
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 _PyUnicodeWriter_Init(&writer);
6530 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6531 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 }
6534 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535
Victor Stinner8f674cc2013-04-17 23:02:17 +02006536 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006538 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006539 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006540 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006541 endinpos = end-starts;
6542 reason = "truncated input";
6543 goto error;
6544 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545 /* We copy the raw representation one byte at a time because the
6546 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[0] = s[0];
6548 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ((char *) &uch)[2] = s[2];
6551 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006552#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006553 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006554#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 /* We have to sanity check the raw data, otherwise doom looms for
6556 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006557 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558 endinpos = s - starts + Py_UNICODE_SIZE;
6559 reason = "illegal code point (> 0x10FFFF)";
6560 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006561 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006562#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 s += Py_UNICODE_SIZE;
6564#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006565 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006566 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006567 Py_UNICODE uch2;
6568 ((char *) &uch2)[0] = s[0];
6569 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006570 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006571 {
Victor Stinner551ac952011-11-29 22:58:13 +01006572 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 }
6575 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576#endif
6577
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006578 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006579 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006580 continue;
6581
6582 error:
6583 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006584 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006585 errors, &errorHandler,
6586 "unicode_internal", reason,
6587 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006588 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006589 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590 }
6591
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592 Py_XDECREF(errorHandler);
6593 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006594 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006597 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006598 Py_XDECREF(errorHandler);
6599 Py_XDECREF(exc);
6600 return NULL;
6601}
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603/* --- Latin-1 Codec ------------------------------------------------------ */
6604
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605PyObject *
6606PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006607 Py_ssize_t size,
6608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006611 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static void
6616make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006618 PyObject *unicode,
6619 Py_ssize_t startpos, Py_ssize_t endpos,
6620 const char *reason)
6621{
6622 if (*exceptionObject == NULL) {
6623 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006625 encoding, unicode, startpos, endpos, reason);
6626 }
6627 else {
6628 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6629 goto onError;
6630 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6631 goto onError;
6632 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6633 goto onError;
6634 return;
6635 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006636 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006637 }
6638}
6639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006640/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641static void
6642raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006643 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006644 PyObject *unicode,
6645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 const char *reason)
6647{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006648 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006649 encoding, unicode, startpos, endpos, reason);
6650 if (*exceptionObject != NULL)
6651 PyCodec_StrictErrors(*exceptionObject);
6652}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653
6654/* error handling callback helper:
6655 build arguments, call the callback and check the arguments,
6656 put the result into newpos and return the replacement string, which
6657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658static PyObject *
6659unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006660 PyObject **errorHandler,
6661 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006663 Py_ssize_t startpos, Py_ssize_t endpos,
6664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006666 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 PyObject *restuple;
6669 PyObject *resunicode;
6670
6671 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675 }
6676
Benjamin Petersonbac79492012-01-14 13:34:47 -05006677 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 return NULL;
6679 len = PyUnicode_GET_LENGTH(unicode);
6680
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006681 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006686 restuple = PyObject_CallFunctionObjArgs(
6687 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006691 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 Py_DECREF(restuple);
6693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006695 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 &resunicode, newpos)) {
6697 Py_DECREF(restuple);
6698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006700 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6701 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6702 Py_DECREF(restuple);
6703 return NULL;
6704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006706 *newpos = len + *newpos;
6707 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006708 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 Py_DECREF(restuple);
6710 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 Py_INCREF(resunicode);
6713 Py_DECREF(restuple);
6714 return resunicode;
6715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006719 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006720 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 /* input state */
6723 Py_ssize_t pos=0, size;
6724 int kind;
6725 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726 /* pointer into the output */
6727 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006728 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6729 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006730 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006732 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006733 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006734 /* output object */
6735 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736
Benjamin Petersonbac79492012-01-14 13:34:47 -05006737 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 return NULL;
6739 size = PyUnicode_GET_LENGTH(unicode);
6740 kind = PyUnicode_KIND(unicode);
6741 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 /* allocate enough for a simple encoding without
6743 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006744 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006745 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006746
6747 _PyBytesWriter_Init(&writer);
6748 str = _PyBytesWriter_Alloc(&writer, size);
6749 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006750 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006753 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006756 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006758 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006762 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006765 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006767
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006768 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006770
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006771 /* Only overallocate the buffer if it's not the last write */
6772 writer.overallocate = (collend < size);
6773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006775 if (error_handler == _Py_ERROR_UNKNOWN)
6776 error_handler = get_error_handler(errors);
6777
6778 switch (error_handler) {
6779 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006780 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006782
6783 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006784 memset(str, '?', collend - collstart);
6785 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006786 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006787 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 break;
Victor Stinner50149202015-09-22 00:26:54 +02006790
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006792 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006793 writer.min_size -= (collend - collstart);
6794 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006795 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796 if (str == NULL)
6797 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006798 pos = collend;
6799 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006801 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006802 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006803 writer.min_size -= (collend - collstart);
6804 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006805 unicode, collstart, collend);
6806 if (str == NULL)
6807 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006808 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 break;
Victor Stinner50149202015-09-22 00:26:54 +02006810
Victor Stinnerc3713e92015-09-29 12:32:13 +02006811 case _Py_ERROR_SURROGATEESCAPE:
6812 for (i = collstart; i < collend; ++i) {
6813 ch = PyUnicode_READ(kind, data, i);
6814 if (ch < 0xdc80 || 0xdcff < ch) {
6815 /* Not a UTF-8b surrogate */
6816 break;
6817 }
6818 *str++ = (char)(ch - 0xdc00);
6819 ++pos;
6820 }
6821 if (i >= collend)
6822 break;
6823 collstart = pos;
6824 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006825 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006826
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006828 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6829 encoding, reason, unicode, &exc,
6830 collstart, collend, &newpos);
6831 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006833
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006834 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006835 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006836
Victor Stinner6bd525b2015-10-09 13:10:05 +02006837 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006838 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006839 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006840 PyBytes_AS_STRING(rep),
6841 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006842 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006843 else {
6844 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006845
Victor Stinner6bd525b2015-10-09 13:10:05 +02006846 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006849 if (limit == 256 ?
6850 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6851 !PyUnicode_IS_ASCII(rep))
6852 {
6853 /* Not all characters are smaller than limit */
6854 raise_encode_exception(&exc, encoding, unicode,
6855 collstart, collend, reason);
6856 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006858 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6859 str = _PyBytesWriter_WriteBytes(&writer, str,
6860 PyUnicode_DATA(rep),
6861 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 }
Miss Islington (bot)1e596d32018-08-19 16:17:53 -04006863 if (str == NULL)
6864 goto onError;
6865
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006866 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006867 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006869
6870 /* If overallocation was disabled, ensure that it was the last
6871 write. Otherwise, we missed an optimization */
6872 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006873 }
6874 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006875
Victor Stinner50149202015-09-22 00:26:54 +02006876 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006878 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879
6880 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006881 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006882 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006883 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006884 Py_XDECREF(exc);
6885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886}
6887
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889PyObject *
6890PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006891 Py_ssize_t size,
6892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006895 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006896 if (unicode == NULL)
6897 return NULL;
6898 result = unicode_encode_ucs1(unicode, errors, 256);
6899 Py_DECREF(unicode);
6900 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901}
6902
Alexander Belopolsky40018472011-02-26 01:02:56 +00006903PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
6906 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 PyErr_BadArgument();
6908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006910 if (PyUnicode_READY(unicode) == -1)
6911 return NULL;
6912 /* Fast path: if it is a one-byte string, construct
6913 bytes object directly. */
6914 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6915 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6916 PyUnicode_GET_LENGTH(unicode));
6917 /* Non-Latin-1 characters present. Defer to above function to
6918 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006919 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006920}
6921
6922PyObject*
6923PyUnicode_AsLatin1String(PyObject *unicode)
6924{
6925 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926}
6927
6928/* --- 7-bit ASCII Codec -------------------------------------------------- */
6929
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930PyObject *
6931PyUnicode_DecodeASCII(const char *s,
6932 Py_ssize_t size,
6933 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006936 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006937 int kind;
6938 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 Py_ssize_t startinpos;
6940 Py_ssize_t endinpos;
6941 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006943 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006945 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006948 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006951 if (size == 1 && (unsigned char)s[0] < 128)
6952 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006953
Victor Stinner8f674cc2013-04-17 23:02:17 +02006954 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006955 writer.min_length = size;
6956 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006957 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006961 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006962 writer.pos = outpos;
6963 if (writer.pos == size)
6964 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006965
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 s += writer.pos;
6967 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006969 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006971 PyUnicode_WRITE(kind, data, writer.pos, c);
6972 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006974 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006976
6977 /* byte outsize range 0x00..0x7f: call the error handler */
6978
6979 if (error_handler == _Py_ERROR_UNKNOWN)
6980 error_handler = get_error_handler(errors);
6981
6982 switch (error_handler)
6983 {
6984 case _Py_ERROR_REPLACE:
6985 case _Py_ERROR_SURROGATEESCAPE:
6986 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006987 but we may switch to UCS2 at the first write */
6988 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6989 goto onError;
6990 kind = writer.kind;
6991 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006992
6993 if (error_handler == _Py_ERROR_REPLACE)
6994 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6995 else
6996 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6997 writer.pos++;
6998 ++s;
6999 break;
7000
7001 case _Py_ERROR_IGNORE:
7002 ++s;
7003 break;
7004
7005 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 startinpos = s-starts;
7007 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007009 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 "ascii", "ordinal not in range(128)",
7011 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 kind = writer.kind;
7015 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007020 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007021
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007023 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007024 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007025 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 return NULL;
7027}
7028
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007030PyObject *
7031PyUnicode_EncodeASCII(const Py_UNICODE *p,
7032 Py_ssize_t size,
7033 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007035 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007036 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007037 if (unicode == NULL)
7038 return NULL;
7039 result = unicode_encode_ucs1(unicode, errors, 128);
7040 Py_DECREF(unicode);
7041 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042}
7043
Alexander Belopolsky40018472011-02-26 01:02:56 +00007044PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007045_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
7047 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 PyErr_BadArgument();
7049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007051 if (PyUnicode_READY(unicode) == -1)
7052 return NULL;
7053 /* Fast path: if it is an ASCII-only string, construct bytes object
7054 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007055 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007056 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7057 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007058 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007059}
7060
7061PyObject *
7062PyUnicode_AsASCIIString(PyObject *unicode)
7063{
7064 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065}
7066
Steve Dowercc16be82016-09-08 10:35:16 -07007067#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007068
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007069/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007070
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007071#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072#define NEED_RETRY
7073#endif
7074
Victor Stinner3a50e702011-10-18 21:21:00 +02007075#ifndef WC_ERR_INVALID_CHARS
7076# define WC_ERR_INVALID_CHARS 0x0080
7077#endif
7078
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007079static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007080code_page_name(UINT code_page, PyObject **obj)
7081{
7082 *obj = NULL;
7083 if (code_page == CP_ACP)
7084 return "mbcs";
7085 if (code_page == CP_UTF7)
7086 return "CP_UTF7";
7087 if (code_page == CP_UTF8)
7088 return "CP_UTF8";
7089
7090 *obj = PyBytes_FromFormat("cp%u", code_page);
7091 if (*obj == NULL)
7092 return NULL;
7093 return PyBytes_AS_STRING(*obj);
7094}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095
Victor Stinner3a50e702011-10-18 21:21:00 +02007096static DWORD
7097decode_code_page_flags(UINT code_page)
7098{
7099 if (code_page == CP_UTF7) {
7100 /* The CP_UTF7 decoder only supports flags=0 */
7101 return 0;
7102 }
7103 else
7104 return MB_ERR_INVALID_CHARS;
7105}
7106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 * Decode a byte string from a Windows code page into unicode object in strict
7109 * mode.
7110 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007111 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7112 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007114static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007115decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007116 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 const char *in,
7118 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119{
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007121 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
7124 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 assert(insize > 0);
7126 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7127 if (outsize <= 0)
7128 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129
7130 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007132 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007133 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 if (*v == NULL)
7135 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 }
7138 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007141 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144 }
7145
7146 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7148 if (outsize <= 0)
7149 goto error;
7150 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007151
Victor Stinner3a50e702011-10-18 21:21:00 +02007152error:
7153 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7154 return -2;
7155 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007156 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157}
7158
Victor Stinner3a50e702011-10-18 21:21:00 +02007159/*
7160 * Decode a byte string from a code page into unicode object with an error
7161 * handler.
7162 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007163 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 * UnicodeDecodeError exception and returns -1 on error.
7165 */
7166static int
7167decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007168 PyObject **v,
7169 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007170 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007171{
7172 const char *startin = in;
7173 const char *endin = in + size;
7174 const DWORD flags = decode_code_page_flags(code_page);
7175 /* Ideally, we should get reason from FormatMessage. This is the Windows
7176 2000 English version of the message. */
7177 const char *reason = "No mapping for the Unicode character exists "
7178 "in the target code page.";
7179 /* each step cannot decode more than 1 character, but a character can be
7180 represented as a surrogate pair */
Miss Islington (bot)bdeb56c2018-12-03 01:09:11 -08007181 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007182 int insize;
7183 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 PyObject *errorHandler = NULL;
7185 PyObject *exc = NULL;
7186 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007187 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 DWORD err;
7189 int ret = -1;
7190
7191 assert(size > 0);
7192
7193 encoding = code_page_name(code_page, &encoding_obj);
7194 if (encoding == NULL)
7195 return -1;
7196
Victor Stinner7d00cc12014-03-17 23:08:06 +01007197 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7199 UnicodeDecodeError. */
7200 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7201 if (exc != NULL) {
7202 PyCodec_StrictErrors(exc);
7203 Py_CLEAR(exc);
7204 }
7205 goto error;
7206 }
7207
7208 if (*v == NULL) {
7209 /* Create unicode object */
7210 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7211 PyErr_NoMemory();
7212 goto error;
7213 }
Victor Stinnerab595942011-12-17 04:59:06 +01007214 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 if (*v == NULL)
7217 goto error;
Miss Islington (bot)bdeb56c2018-12-03 01:09:11 -08007218 out = PyUnicode_AS_UNICODE(*v);
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 }
7220 else {
7221 /* Extend unicode object */
7222 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7223 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7224 PyErr_NoMemory();
7225 goto error;
7226 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007227 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 goto error;
Miss Islington (bot)bdeb56c2018-12-03 01:09:11 -08007229 out = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 }
7231
7232 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 while (in < endin)
7234 {
7235 /* Decode a character */
7236 insize = 1;
7237 do
7238 {
7239 outsize = MultiByteToWideChar(code_page, flags,
7240 in, insize,
7241 buffer, Py_ARRAY_LENGTH(buffer));
7242 if (outsize > 0)
7243 break;
7244 err = GetLastError();
7245 if (err != ERROR_NO_UNICODE_TRANSLATION
7246 && err != ERROR_INSUFFICIENT_BUFFER)
7247 {
7248 PyErr_SetFromWindowsErr(0);
7249 goto error;
7250 }
7251 insize++;
7252 }
7253 /* 4=maximum length of a UTF-8 sequence */
7254 while (insize <= 4 && (in + insize) <= endin);
7255
7256 if (outsize <= 0) {
7257 Py_ssize_t startinpos, endinpos, outpos;
7258
Victor Stinner7d00cc12014-03-17 23:08:06 +01007259 /* last character in partial decode? */
7260 if (in + insize >= endin && !final)
7261 break;
7262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 startinpos = in - startin;
7264 endinpos = startinpos + 1;
7265 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007266 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 errors, &errorHandler,
7268 encoding, reason,
7269 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007270 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 {
7272 goto error;
7273 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007274 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 }
7276 else {
7277 in += insize;
7278 memcpy(out, buffer, outsize * sizeof(wchar_t));
7279 out += outsize;
7280 }
7281 }
7282
7283 /* write a NUL character at the end */
7284 *out = 0;
7285
7286 /* Extend unicode object */
Miss Islington (bot)bdeb56c2018-12-03 01:09:11 -08007287 outsize = out - PyUnicode_AS_UNICODE(*v);
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007289 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007291 /* (in - startin) <= size and size is an int */
7292 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007293
7294error:
7295 Py_XDECREF(encoding_obj);
7296 Py_XDECREF(errorHandler);
7297 Py_XDECREF(exc);
7298 return ret;
7299}
7300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301static PyObject *
7302decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 const char *s, Py_ssize_t size,
7304 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305{
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 PyObject *v = NULL;
7307 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 if (code_page < 0) {
7310 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7311 return NULL;
7312 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007313 if (size < 0) {
7314 PyErr_BadInternalCall();
7315 return NULL;
7316 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007317
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007318 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 do
7322 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 if (size > INT_MAX) {
7325 chunk_size = INT_MAX;
7326 final = 0;
7327 done = 0;
7328 }
7329 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 {
7332 chunk_size = (int)size;
7333 final = (consumed == NULL);
7334 done = 1;
7335 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 if (chunk_size == 0 && done) {
7338 if (v != NULL)
7339 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007340 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007342
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 converted = decode_code_page_strict(code_page, &v,
7344 s, chunk_size);
7345 if (converted == -2)
7346 converted = decode_code_page_errors(code_page, &v,
7347 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007348 errors, final);
7349 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007350
7351 if (converted < 0) {
7352 Py_XDECREF(v);
7353 return NULL;
7354 }
7355
7356 if (consumed)
7357 *consumed += converted;
7358
7359 s += converted;
7360 size -= converted;
7361 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007362
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007363 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364}
7365
Alexander Belopolsky40018472011-02-26 01:02:56 +00007366PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007367PyUnicode_DecodeCodePageStateful(int code_page,
7368 const char *s,
7369 Py_ssize_t size,
7370 const char *errors,
7371 Py_ssize_t *consumed)
7372{
7373 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7374}
7375
7376PyObject *
7377PyUnicode_DecodeMBCSStateful(const char *s,
7378 Py_ssize_t size,
7379 const char *errors,
7380 Py_ssize_t *consumed)
7381{
7382 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7383}
7384
7385PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007386PyUnicode_DecodeMBCS(const char *s,
7387 Py_ssize_t size,
7388 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007389{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7391}
7392
Victor Stinner3a50e702011-10-18 21:21:00 +02007393static DWORD
7394encode_code_page_flags(UINT code_page, const char *errors)
7395{
7396 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007397 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 }
7399 else if (code_page == CP_UTF7) {
7400 /* CP_UTF7 only supports flags=0 */
7401 return 0;
7402 }
7403 else {
7404 if (errors != NULL && strcmp(errors, "replace") == 0)
7405 return 0;
7406 else
7407 return WC_NO_BEST_FIT_CHARS;
7408 }
7409}
7410
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 * Encode a Unicode string to a Windows code page into a byte string in strict
7413 * mode.
7414 *
7415 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007416 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007418static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007419encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007420 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422{
Victor Stinner554f3f02010-06-16 23:33:54 +00007423 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 BOOL *pusedDefaultChar = &usedDefaultChar;
7425 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007426 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007427 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 const DWORD flags = encode_code_page_flags(code_page, NULL);
7429 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 /* Create a substring so that we can get the UTF-16 representation
7431 of just the slice under consideration. */
7432 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433
Martin v. Löwis3d325192011-11-04 18:23:06 +01007434 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007435
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007437 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007439 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007440
Victor Stinner2fc507f2011-11-04 20:06:39 +01007441 substring = PyUnicode_Substring(unicode, offset, offset+len);
7442 if (substring == NULL)
7443 return -1;
7444 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7445 if (p == NULL) {
7446 Py_DECREF(substring);
7447 return -1;
7448 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007449 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007451 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007453 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 NULL, 0,
7455 NULL, pusedDefaultChar);
7456 if (outsize <= 0)
7457 goto error;
7458 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007459 if (pusedDefaultChar && *pusedDefaultChar) {
7460 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007463
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 if (*outbytes == NULL) {
7468 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472 }
7473 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 const Py_ssize_t n = PyBytes_Size(*outbytes);
7476 if (outsize > PY_SSIZE_T_MAX - n) {
7477 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007478 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7482 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 }
7487
7488 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007490 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 out, outsize,
7492 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 if (outsize <= 0)
7495 goto error;
7496 if (pusedDefaultChar && *pusedDefaultChar)
7497 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007501 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7503 return -2;
7504 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007505 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007506}
7507
Victor Stinner3a50e702011-10-18 21:21:00 +02007508/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007509 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 * error handler.
7511 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007512 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 * -1 on other error.
7514 */
7515static int
7516encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007517 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007518 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007519{
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007521 Py_ssize_t pos = unicode_offset;
7522 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 /* Ideally, we should get reason from FormatMessage. This is the Windows
7524 2000 English version of the message. */
7525 const char *reason = "invalid character";
7526 /* 4=maximum length of a UTF-8 sequence */
7527 char buffer[4];
7528 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7529 Py_ssize_t outsize;
7530 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 PyObject *errorHandler = NULL;
7532 PyObject *exc = NULL;
7533 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007534 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007535 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 PyObject *rep;
7537 int ret = -1;
7538
7539 assert(insize > 0);
7540
7541 encoding = code_page_name(code_page, &encoding_obj);
7542 if (encoding == NULL)
7543 return -1;
7544
7545 if (errors == NULL || strcmp(errors, "strict") == 0) {
7546 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7547 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007548 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 if (exc != NULL) {
7550 PyCodec_StrictErrors(exc);
7551 Py_DECREF(exc);
7552 }
7553 Py_XDECREF(encoding_obj);
7554 return -1;
7555 }
7556
7557 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7558 pusedDefaultChar = &usedDefaultChar;
7559 else
7560 pusedDefaultChar = NULL;
7561
7562 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7563 PyErr_NoMemory();
7564 goto error;
7565 }
7566 outsize = insize * Py_ARRAY_LENGTH(buffer);
7567
7568 if (*outbytes == NULL) {
7569 /* Create string object */
7570 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7571 if (*outbytes == NULL)
7572 goto error;
7573 out = PyBytes_AS_STRING(*outbytes);
7574 }
7575 else {
7576 /* Extend string object */
7577 Py_ssize_t n = PyBytes_Size(*outbytes);
7578 if (n > PY_SSIZE_T_MAX - outsize) {
7579 PyErr_NoMemory();
7580 goto error;
7581 }
7582 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7583 goto error;
7584 out = PyBytes_AS_STRING(*outbytes) + n;
7585 }
7586
7587 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007588 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7591 wchar_t chars[2];
7592 int charsize;
7593 if (ch < 0x10000) {
7594 chars[0] = (wchar_t)ch;
7595 charsize = 1;
7596 }
7597 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007598 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7599 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007600 charsize = 2;
7601 }
7602
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007604 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 buffer, Py_ARRAY_LENGTH(buffer),
7606 NULL, pusedDefaultChar);
7607 if (outsize > 0) {
7608 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7609 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007610 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 memcpy(out, buffer, outsize);
7612 out += outsize;
7613 continue;
7614 }
7615 }
7616 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7617 PyErr_SetFromWindowsErr(0);
7618 goto error;
7619 }
7620
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 rep = unicode_encode_call_errorhandler(
7622 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007623 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007624 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 if (rep == NULL)
7626 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007628
7629 if (PyBytes_Check(rep)) {
7630 outsize = PyBytes_GET_SIZE(rep);
7631 if (outsize != 1) {
7632 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7633 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7634 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7635 Py_DECREF(rep);
7636 goto error;
7637 }
7638 out = PyBytes_AS_STRING(*outbytes) + offset;
7639 }
7640 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7641 out += outsize;
7642 }
7643 else {
7644 Py_ssize_t i;
7645 enum PyUnicode_Kind kind;
7646 void *data;
7647
Benjamin Petersonbac79492012-01-14 13:34:47 -05007648 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 Py_DECREF(rep);
7650 goto error;
7651 }
7652
7653 outsize = PyUnicode_GET_LENGTH(rep);
7654 if (outsize != 1) {
7655 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7656 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7657 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7658 Py_DECREF(rep);
7659 goto error;
7660 }
7661 out = PyBytes_AS_STRING(*outbytes) + offset;
7662 }
7663 kind = PyUnicode_KIND(rep);
7664 data = PyUnicode_DATA(rep);
7665 for (i=0; i < outsize; i++) {
7666 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7667 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007668 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669 encoding, unicode,
7670 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 "unable to encode error handler result to ASCII");
7672 Py_DECREF(rep);
7673 goto error;
7674 }
7675 *out = (unsigned char)ch;
7676 out++;
7677 }
7678 }
7679 Py_DECREF(rep);
7680 }
7681 /* write a NUL byte */
7682 *out = 0;
7683 outsize = out - PyBytes_AS_STRING(*outbytes);
7684 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7685 if (_PyBytes_Resize(outbytes, outsize) < 0)
7686 goto error;
7687 ret = 0;
7688
7689error:
7690 Py_XDECREF(encoding_obj);
7691 Py_XDECREF(errorHandler);
7692 Py_XDECREF(exc);
7693 return ret;
7694}
7695
Victor Stinner3a50e702011-10-18 21:21:00 +02007696static PyObject *
7697encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007698 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 const char *errors)
7700{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007701 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007703 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007704 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007705
Victor Stinner29dacf22015-01-26 16:41:32 +01007706 if (!PyUnicode_Check(unicode)) {
7707 PyErr_BadArgument();
7708 return NULL;
7709 }
7710
Benjamin Petersonbac79492012-01-14 13:34:47 -05007711 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007712 return NULL;
7713 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007714
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 if (code_page < 0) {
7716 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7717 return NULL;
7718 }
7719
Martin v. Löwis3d325192011-11-04 18:23:06 +01007720 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007721 return PyBytes_FromStringAndSize(NULL, 0);
7722
Victor Stinner7581cef2011-11-03 22:32:33 +01007723 offset = 0;
7724 do
7725 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007726#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007727 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007728 chunks. */
7729 if (len > INT_MAX/2) {
7730 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 done = 0;
7732 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007734#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 done = 1;
7738 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007739
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007741 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 errors);
7743 if (ret == -2)
7744 ret = encode_code_page_errors(code_page, &outbytes,
7745 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 if (ret < 0) {
7748 Py_XDECREF(outbytes);
7749 return NULL;
7750 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Victor Stinner7581cef2011-11-03 22:32:33 +01007752 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007753 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007754 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007755
Victor Stinner3a50e702011-10-18 21:21:00 +02007756 return outbytes;
7757}
7758
7759PyObject *
7760PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7761 Py_ssize_t size,
7762 const char *errors)
7763{
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007765 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007766 if (unicode == NULL)
7767 return NULL;
7768 res = encode_code_page(CP_ACP, unicode, errors);
7769 Py_DECREF(unicode);
7770 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007771}
7772
7773PyObject *
7774PyUnicode_EncodeCodePage(int code_page,
7775 PyObject *unicode,
7776 const char *errors)
7777{
Victor Stinner7581cef2011-11-03 22:32:33 +01007778 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007779}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007780
Alexander Belopolsky40018472011-02-26 01:02:56 +00007781PyObject *
7782PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007783{
Victor Stinner7581cef2011-11-03 22:32:33 +01007784 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007785}
7786
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007787#undef NEED_RETRY
7788
Steve Dowercc16be82016-09-08 10:35:16 -07007789#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007790
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791/* --- Character Mapping Codec -------------------------------------------- */
7792
Victor Stinnerfb161b12013-04-18 01:44:27 +02007793static int
7794charmap_decode_string(const char *s,
7795 Py_ssize_t size,
7796 PyObject *mapping,
7797 const char *errors,
7798 _PyUnicodeWriter *writer)
7799{
7800 const char *starts = s;
7801 const char *e;
7802 Py_ssize_t startinpos, endinpos;
7803 PyObject *errorHandler = NULL, *exc = NULL;
7804 Py_ssize_t maplen;
7805 enum PyUnicode_Kind mapkind;
7806 void *mapdata;
7807 Py_UCS4 x;
7808 unsigned char ch;
7809
7810 if (PyUnicode_READY(mapping) == -1)
7811 return -1;
7812
7813 maplen = PyUnicode_GET_LENGTH(mapping);
7814 mapdata = PyUnicode_DATA(mapping);
7815 mapkind = PyUnicode_KIND(mapping);
7816
7817 e = s + size;
7818
7819 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7820 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7821 * is disabled in encoding aliases, latin1 is preferred because
7822 * its implementation is faster. */
7823 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7824 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7825 Py_UCS4 maxchar = writer->maxchar;
7826
7827 assert (writer->kind == PyUnicode_1BYTE_KIND);
7828 while (s < e) {
7829 ch = *s;
7830 x = mapdata_ucs1[ch];
7831 if (x > maxchar) {
7832 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7833 goto onError;
7834 maxchar = writer->maxchar;
7835 outdata = (Py_UCS1 *)writer->data;
7836 }
7837 outdata[writer->pos] = x;
7838 writer->pos++;
7839 ++s;
7840 }
7841 return 0;
7842 }
7843
7844 while (s < e) {
7845 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7846 enum PyUnicode_Kind outkind = writer->kind;
7847 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7848 if (outkind == PyUnicode_1BYTE_KIND) {
7849 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7850 Py_UCS4 maxchar = writer->maxchar;
7851 while (s < e) {
7852 ch = *s;
7853 x = mapdata_ucs2[ch];
7854 if (x > maxchar)
7855 goto Error;
7856 outdata[writer->pos] = x;
7857 writer->pos++;
7858 ++s;
7859 }
7860 break;
7861 }
7862 else if (outkind == PyUnicode_2BYTE_KIND) {
7863 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7864 while (s < e) {
7865 ch = *s;
7866 x = mapdata_ucs2[ch];
7867 if (x == 0xFFFE)
7868 goto Error;
7869 outdata[writer->pos] = x;
7870 writer->pos++;
7871 ++s;
7872 }
7873 break;
7874 }
7875 }
7876 ch = *s;
7877
7878 if (ch < maplen)
7879 x = PyUnicode_READ(mapkind, mapdata, ch);
7880 else
7881 x = 0xfffe; /* invalid value */
7882Error:
7883 if (x == 0xfffe)
7884 {
7885 /* undefined mapping */
7886 startinpos = s-starts;
7887 endinpos = startinpos+1;
7888 if (unicode_decode_call_errorhandler_writer(
7889 errors, &errorHandler,
7890 "charmap", "character maps to <undefined>",
7891 &starts, &e, &startinpos, &endinpos, &exc, &s,
7892 writer)) {
7893 goto onError;
7894 }
7895 continue;
7896 }
7897
7898 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7899 goto onError;
7900 ++s;
7901 }
7902 Py_XDECREF(errorHandler);
7903 Py_XDECREF(exc);
7904 return 0;
7905
7906onError:
7907 Py_XDECREF(errorHandler);
7908 Py_XDECREF(exc);
7909 return -1;
7910}
7911
7912static int
7913charmap_decode_mapping(const char *s,
7914 Py_ssize_t size,
7915 PyObject *mapping,
7916 const char *errors,
7917 _PyUnicodeWriter *writer)
7918{
7919 const char *starts = s;
7920 const char *e;
7921 Py_ssize_t startinpos, endinpos;
7922 PyObject *errorHandler = NULL, *exc = NULL;
7923 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007924 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007925
7926 e = s + size;
7927
7928 while (s < e) {
7929 ch = *s;
7930
7931 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7932 key = PyLong_FromLong((long)ch);
7933 if (key == NULL)
7934 goto onError;
7935
7936 item = PyObject_GetItem(mapping, key);
7937 Py_DECREF(key);
7938 if (item == NULL) {
7939 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7940 /* No mapping found means: mapping is undefined. */
7941 PyErr_Clear();
7942 goto Undefined;
7943 } else
7944 goto onError;
7945 }
7946
7947 /* Apply mapping */
7948 if (item == Py_None)
7949 goto Undefined;
7950 if (PyLong_Check(item)) {
7951 long value = PyLong_AS_LONG(item);
7952 if (value == 0xFFFE)
7953 goto Undefined;
7954 if (value < 0 || value > MAX_UNICODE) {
7955 PyErr_Format(PyExc_TypeError,
7956 "character mapping must be in range(0x%lx)",
7957 (unsigned long)MAX_UNICODE + 1);
7958 goto onError;
7959 }
7960
7961 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7962 goto onError;
7963 }
7964 else if (PyUnicode_Check(item)) {
7965 if (PyUnicode_READY(item) == -1)
7966 goto onError;
7967 if (PyUnicode_GET_LENGTH(item) == 1) {
7968 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7969 if (value == 0xFFFE)
7970 goto Undefined;
7971 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7972 goto onError;
7973 }
7974 else {
7975 writer->overallocate = 1;
7976 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7977 goto onError;
7978 }
7979 }
7980 else {
7981 /* wrong return value */
7982 PyErr_SetString(PyExc_TypeError,
7983 "character mapping must return integer, None or str");
7984 goto onError;
7985 }
7986 Py_CLEAR(item);
7987 ++s;
7988 continue;
7989
7990Undefined:
7991 /* undefined mapping */
7992 Py_CLEAR(item);
7993 startinpos = s-starts;
7994 endinpos = startinpos+1;
7995 if (unicode_decode_call_errorhandler_writer(
7996 errors, &errorHandler,
7997 "charmap", "character maps to <undefined>",
7998 &starts, &e, &startinpos, &endinpos, &exc, &s,
7999 writer)) {
8000 goto onError;
8001 }
8002 }
8003 Py_XDECREF(errorHandler);
8004 Py_XDECREF(exc);
8005 return 0;
8006
8007onError:
8008 Py_XDECREF(item);
8009 Py_XDECREF(errorHandler);
8010 Py_XDECREF(exc);
8011 return -1;
8012}
8013
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014PyObject *
8015PyUnicode_DecodeCharmap(const char *s,
8016 Py_ssize_t size,
8017 PyObject *mapping,
8018 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008020 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 /* Default to Latin-1 */
8023 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008027 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008028 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008029 writer.min_length = size;
8030 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008032
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008033 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008036 }
8037 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008038 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8039 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008041 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008042
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008044 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 return NULL;
8046}
8047
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048/* Charmap encoding: the lookup table */
8049
Alexander Belopolsky40018472011-02-26 01:02:56 +00008050struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 PyObject_HEAD
8052 unsigned char level1[32];
8053 int count2, count3;
8054 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055};
8056
8057static PyObject*
8058encoding_map_size(PyObject *obj, PyObject* args)
8059{
8060 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008063}
8064
8065static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008066 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 PyDoc_STR("Return the size (in bytes) of this object") },
8068 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069};
8070
8071static void
8072encoding_map_dealloc(PyObject* o)
8073{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075}
8076
8077static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008078 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 "EncodingMap", /*tp_name*/
8080 sizeof(struct encoding_map), /*tp_basicsize*/
8081 0, /*tp_itemsize*/
8082 /* methods */
8083 encoding_map_dealloc, /*tp_dealloc*/
8084 0, /*tp_print*/
8085 0, /*tp_getattr*/
8086 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008087 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 0, /*tp_repr*/
8089 0, /*tp_as_number*/
8090 0, /*tp_as_sequence*/
8091 0, /*tp_as_mapping*/
8092 0, /*tp_hash*/
8093 0, /*tp_call*/
8094 0, /*tp_str*/
8095 0, /*tp_getattro*/
8096 0, /*tp_setattro*/
8097 0, /*tp_as_buffer*/
8098 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8099 0, /*tp_doc*/
8100 0, /*tp_traverse*/
8101 0, /*tp_clear*/
8102 0, /*tp_richcompare*/
8103 0, /*tp_weaklistoffset*/
8104 0, /*tp_iter*/
8105 0, /*tp_iternext*/
8106 encoding_map_methods, /*tp_methods*/
8107 0, /*tp_members*/
8108 0, /*tp_getset*/
8109 0, /*tp_base*/
8110 0, /*tp_dict*/
8111 0, /*tp_descr_get*/
8112 0, /*tp_descr_set*/
8113 0, /*tp_dictoffset*/
8114 0, /*tp_init*/
8115 0, /*tp_alloc*/
8116 0, /*tp_new*/
8117 0, /*tp_free*/
8118 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119};
8120
8121PyObject*
8122PyUnicode_BuildEncodingMap(PyObject* string)
8123{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 PyObject *result;
8125 struct encoding_map *mresult;
8126 int i;
8127 int need_dict = 0;
8128 unsigned char level1[32];
8129 unsigned char level2[512];
8130 unsigned char *mlevel1, *mlevel2, *mlevel3;
8131 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 int kind;
8133 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008134 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008137 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 PyErr_BadArgument();
8139 return NULL;
8140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 kind = PyUnicode_KIND(string);
8142 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008143 length = PyUnicode_GET_LENGTH(string);
8144 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 memset(level1, 0xFF, sizeof level1);
8146 memset(level2, 0xFF, sizeof level2);
8147
8148 /* If there isn't a one-to-one mapping of NULL to \0,
8149 or if there are non-BMP characters, we need to use
8150 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008153 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008154 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 ch = PyUnicode_READ(kind, data, i);
8156 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 need_dict = 1;
8158 break;
8159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 /* unmapped character */
8162 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008163 l1 = ch >> 11;
8164 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 if (level1[l1] == 0xFF)
8166 level1[l1] = count2++;
8167 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 }
8170
8171 if (count2 >= 0xFF || count3 >= 0xFF)
8172 need_dict = 1;
8173
8174 if (need_dict) {
8175 PyObject *result = PyDict_New();
8176 PyObject *key, *value;
8177 if (!result)
8178 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008179 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008181 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182 if (!key || !value)
8183 goto failed1;
8184 if (PyDict_SetItem(result, key, value) == -1)
8185 goto failed1;
8186 Py_DECREF(key);
8187 Py_DECREF(value);
8188 }
8189 return result;
8190 failed1:
8191 Py_XDECREF(key);
8192 Py_XDECREF(value);
8193 Py_DECREF(result);
8194 return NULL;
8195 }
8196
8197 /* Create a three-level trie */
8198 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8199 16*count2 + 128*count3 - 1);
8200 if (!result)
8201 return PyErr_NoMemory();
8202 PyObject_Init(result, &EncodingMapType);
8203 mresult = (struct encoding_map*)result;
8204 mresult->count2 = count2;
8205 mresult->count3 = count3;
8206 mlevel1 = mresult->level1;
8207 mlevel2 = mresult->level23;
8208 mlevel3 = mresult->level23 + 16*count2;
8209 memcpy(mlevel1, level1, 32);
8210 memset(mlevel2, 0xFF, 16*count2);
8211 memset(mlevel3, 0, 128*count3);
8212 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008213 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8216 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 /* unmapped character */
8218 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008219 o1 = ch>>11;
8220 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 i2 = 16*mlevel1[o1] + o2;
8222 if (mlevel2[i2] == 0xFF)
8223 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008224 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008225 i3 = 128*mlevel2[i2] + o3;
8226 mlevel3[i3] = i;
8227 }
8228 return result;
8229}
8230
8231static int
Victor Stinner22168992011-11-20 17:09:18 +01008232encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008233{
8234 struct encoding_map *map = (struct encoding_map*)mapping;
8235 int l1 = c>>11;
8236 int l2 = (c>>7) & 0xF;
8237 int l3 = c & 0x7F;
8238 int i;
8239
Victor Stinner22168992011-11-20 17:09:18 +01008240 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008242 if (c == 0)
8243 return 0;
8244 /* level 1*/
8245 i = map->level1[l1];
8246 if (i == 0xFF) {
8247 return -1;
8248 }
8249 /* level 2*/
8250 i = map->level23[16*i+l2];
8251 if (i == 0xFF) {
8252 return -1;
8253 }
8254 /* level 3 */
8255 i = map->level23[16*map->count2 + 128*i + l3];
8256 if (i == 0) {
8257 return -1;
8258 }
8259 return i;
8260}
8261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262/* Lookup the character ch in the mapping. If the character
8263 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008264 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008265static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008266charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267{
Christian Heimes217cfd12007-12-02 14:31:20 +00008268 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 PyObject *x;
8270
8271 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 x = PyObject_GetItem(mapping, w);
8274 Py_DECREF(w);
8275 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8277 /* No mapping found means: mapping is undefined. */
8278 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008279 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 } else
8281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008283 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008285 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 long value = PyLong_AS_LONG(x);
8287 if (value < 0 || value > 255) {
8288 PyErr_SetString(PyExc_TypeError,
8289 "character mapping must be in range(256)");
8290 Py_DECREF(x);
8291 return NULL;
8292 }
8293 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008295 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 /* wrong return value */
8299 PyErr_Format(PyExc_TypeError,
8300 "character mapping must return integer, bytes or None, not %.400s",
8301 x->ob_type->tp_name);
8302 Py_DECREF(x);
8303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
8305}
8306
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008308charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8311 /* exponentially overallocate to minimize reallocations */
8312 if (requiredsize < 2*outsize)
8313 requiredsize = 2*outsize;
8314 if (_PyBytes_Resize(outobj, requiredsize))
8315 return -1;
8316 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317}
8318
/* Outcome of an attempt to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008323 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 space is available. Return a new reference to the object that
8325 was put in the output buffer, or Py_None, if the mapping was undefined
8326 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008327 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008329charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 PyObject *rep;
8333 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008334 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335
Christian Heimes90aa7642007-12-19 02:45:37 +00008336 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 if (res == -1)
8340 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 if (outsize<requiredsize)
8342 if (charmapencode_resize(outobj, outpos, requiredsize))
8343 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 outstart[(*outpos)++] = (char)res;
8346 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 }
8348
8349 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 Py_DECREF(rep);
8354 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 if (PyLong_Check(rep)) {
8357 Py_ssize_t requiredsize = *outpos+1;
8358 if (outsize<requiredsize)
8359 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8360 Py_DECREF(rep);
8361 return enc_EXCEPTION;
8362 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008363 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 else {
8367 const char *repchars = PyBytes_AS_STRING(rep);
8368 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8369 Py_ssize_t requiredsize = *outpos+repsize;
8370 if (outsize<requiredsize)
8371 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8372 Py_DECREF(rep);
8373 return enc_EXCEPTION;
8374 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008375 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 memcpy(outstart + *outpos, repchars, repsize);
8377 *outpos += repsize;
8378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 Py_DECREF(rep);
8381 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382}
8383
8384/* handle an error in PyUnicode_EncodeCharmap
8385 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386static int
8387charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008388 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008390 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008391 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392{
8393 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008396 enum PyUnicode_Kind kind;
8397 void *data;
8398 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008400 Py_ssize_t collstartpos = *inpos;
8401 Py_ssize_t collendpos = *inpos+1;
8402 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008403 const char *encoding = "charmap";
8404 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008406 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008407 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408
Benjamin Petersonbac79492012-01-14 13:34:47 -05008409 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008410 return -1;
8411 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 /* find all unencodable characters */
8413 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008415 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008416 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008417 val = encoding_map_lookup(ch, mapping);
8418 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 break;
8420 ++collendpos;
8421 continue;
8422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008424 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8425 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 if (rep==NULL)
8427 return -1;
8428 else if (rep!=Py_None) {
8429 Py_DECREF(rep);
8430 break;
8431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 }
8435 /* cache callback name lookup
8436 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008437 if (*error_handler == _Py_ERROR_UNKNOWN)
8438 *error_handler = get_error_handler(errors);
8439
8440 switch (*error_handler) {
8441 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008442 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008444
8445 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 x = charmapencode_output('?', mapping, res, respos);
8448 if (x==enc_EXCEPTION) {
8449 return -1;
8450 }
8451 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008452 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 return -1;
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 }
8456 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008457 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 *inpos = collendpos;
8459 break;
Victor Stinner50149202015-09-22 00:26:54 +02008460
8461 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 /* generate replacement (temporarily (mis)uses p) */
8463 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 char buffer[2+29+1+1];
8465 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008466 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 for (cp = buffer; *cp; ++cp) {
8468 x = charmapencode_output(*cp, mapping, res, respos);
8469 if (x==enc_EXCEPTION)
8470 return -1;
8471 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008472 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 return -1;
8474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 }
8476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 *inpos = collendpos;
8478 break;
Victor Stinner50149202015-09-22 00:26:54 +02008479
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 default:
Victor Stinner50149202015-09-22 00:26:54 +02008481 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008482 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008486 if (PyBytes_Check(repunicode)) {
8487 /* Directly copy bytes result to output. */
8488 Py_ssize_t outsize = PyBytes_Size(*res);
8489 Py_ssize_t requiredsize;
8490 repsize = PyBytes_Size(repunicode);
8491 requiredsize = *respos + repsize;
8492 if (requiredsize > outsize)
8493 /* Make room for all additional bytes. */
8494 if (charmapencode_resize(res, respos, requiredsize)) {
8495 Py_DECREF(repunicode);
8496 return -1;
8497 }
8498 memcpy(PyBytes_AsString(*res) + *respos,
8499 PyBytes_AsString(repunicode), repsize);
8500 *respos += repsize;
8501 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008502 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008503 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008506 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008507 Py_DECREF(repunicode);
8508 return -1;
8509 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008510 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008511 data = PyUnicode_DATA(repunicode);
8512 kind = PyUnicode_KIND(repunicode);
8513 for (index = 0; index < repsize; index++) {
8514 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8515 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008517 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return -1;
8519 }
8520 else if (x==enc_FAILED) {
8521 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008522 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return -1;
8524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008525 }
8526 *inpos = newpos;
8527 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 }
8529 return 0;
8530}
8531
Alexander Belopolsky40018472011-02-26 01:02:56 +00008532PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008533_PyUnicode_EncodeCharmap(PyObject *unicode,
8534 PyObject *mapping,
8535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 /* output object */
8538 PyObject *res = NULL;
8539 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008543 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008544 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008546 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008547 void *data;
8548 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
Benjamin Petersonbac79492012-01-14 13:34:47 -05008550 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551 return NULL;
8552 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008553 data = PyUnicode_DATA(unicode);
8554 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 /* Default to Latin-1 */
8557 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 /* allocate enough for a simple encoding without
8561 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008562 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (res == NULL)
8564 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008565 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008569 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 if (x==enc_EXCEPTION) /* error */
8573 goto onError;
8574 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008575 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008577 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 &res, &respos)) {
8579 goto onError;
8580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 else
8583 /* done with this character => adjust input position */
8584 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008588 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008589 if (_PyBytes_Resize(&res, respos) < 0)
8590 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008593 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 return res;
8595
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 Py_XDECREF(res);
8598 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008599 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 return NULL;
8601}
8602
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008603/* Deprecated */
8604PyObject *
8605PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8606 Py_ssize_t size,
8607 PyObject *mapping,
8608 const char *errors)
8609{
8610 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008611 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612 if (unicode == NULL)
8613 return NULL;
8614 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8615 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008616 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008617}
8618
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619PyObject *
8620PyUnicode_AsCharmapString(PyObject *unicode,
8621 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622{
8623 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 PyErr_BadArgument();
8625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008627 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628}
8629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631static void
8632make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008634 Py_ssize_t startpos, Py_ssize_t endpos,
8635 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 *exceptionObject = _PyUnicodeTranslateError_Create(
8639 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 }
8641 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8643 goto onError;
8644 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8645 goto onError;
8646 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8647 goto onError;
8648 return;
8649 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008650 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
8652}
8653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654/* error handling callback helper:
8655 build arguments, call the callback and check the arguments,
8656 put the result into newpos and return the replacement string, which
8657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static PyObject *
8659unicode_translate_call_errorhandler(const char *errors,
8660 PyObject **errorHandler,
8661 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663 Py_ssize_t startpos, Py_ssize_t endpos,
8664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008666 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008668 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 PyObject *restuple;
8670 PyObject *resunicode;
8671
8672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
8677
8678 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008683 restuple = PyObject_CallFunctionObjArgs(
8684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008688 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008692 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 &resunicode, &i_newpos)) {
8694 Py_DECREF(restuple);
8695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 else
8700 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008702 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 Py_DECREF(restuple);
8704 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 Py_INCREF(resunicode);
8707 Py_DECREF(restuple);
8708 return resunicode;
8709}
8710
8711/* Lookup the character ch in the mapping and put the result in result,
8712 which must be decrefed by the caller.
8713 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008714static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716{
Christian Heimes217cfd12007-12-02 14:31:20 +00008717 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 PyObject *x;
8719
8720 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 x = PyObject_GetItem(mapping, w);
8723 Py_DECREF(w);
8724 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8726 /* No mapping found means: use 1:1 mapping. */
8727 PyErr_Clear();
8728 *result = NULL;
8729 return 0;
8730 } else
8731 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
8733 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 *result = x;
8735 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008737 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008739 if (value < 0 || value > MAX_UNICODE) {
8740 PyErr_Format(PyExc_ValueError,
8741 "character mapping must be in range(0x%x)",
8742 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 Py_DECREF(x);
8744 return -1;
8745 }
8746 *result = x;
8747 return 0;
8748 }
8749 else if (PyUnicode_Check(x)) {
8750 *result = x;
8751 return 0;
8752 }
8753 else {
8754 /* wrong return value */
8755 PyErr_SetString(PyExc_TypeError,
8756 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 Py_DECREF(x);
8758 return -1;
8759 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760}
Victor Stinner1194ea02014-04-04 19:37:40 +02008761
8762/* lookup the character, write the result into the writer.
8763 Return 1 if the result was written into the writer, return 0 if the mapping
8764 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008765static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008766charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8767 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768{
Victor Stinner1194ea02014-04-04 19:37:40 +02008769 PyObject *item;
8770
8771 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008773
8774 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008776 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008779 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008780 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008781
8782 if (item == Py_None) {
8783 Py_DECREF(item);
8784 return 0;
8785 }
8786
8787 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008788 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8789 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8790 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008791 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8792 Py_DECREF(item);
8793 return -1;
8794 }
8795 Py_DECREF(item);
8796 return 1;
8797 }
8798
8799 if (!PyUnicode_Check(item)) {
8800 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008802 }
8803
8804 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8805 Py_DECREF(item);
8806 return -1;
8807 }
8808
8809 Py_DECREF(item);
8810 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811}
8812
Victor Stinner89a76ab2014-04-05 11:44:04 +02008813static int
8814unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8815 Py_UCS1 *translate)
8816{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008817 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008818 int ret = 0;
8819
Victor Stinner89a76ab2014-04-05 11:44:04 +02008820 if (charmaptranslate_lookup(ch, mapping, &item)) {
8821 return -1;
8822 }
8823
8824 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008825 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008826 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008828 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008829 /* not found => default to 1:1 mapping */
8830 translate[ch] = ch;
8831 return 1;
8832 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008833 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008834 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008835 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8836 used it */
8837 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838 /* invalid character or character outside ASCII:
8839 skip the fast translate */
8840 goto exit;
8841 }
8842 translate[ch] = (Py_UCS1)replace;
8843 }
8844 else if (PyUnicode_Check(item)) {
8845 Py_UCS4 replace;
8846
8847 if (PyUnicode_READY(item) == -1) {
8848 Py_DECREF(item);
8849 return -1;
8850 }
8851 if (PyUnicode_GET_LENGTH(item) != 1)
8852 goto exit;
8853
8854 replace = PyUnicode_READ_CHAR(item, 0);
8855 if (replace > 127)
8856 goto exit;
8857 translate[ch] = (Py_UCS1)replace;
8858 }
8859 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008860 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 goto exit;
8862 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 ret = 1;
8864
Benjamin Peterson1365de72014-04-07 20:15:41 -04008865 exit:
8866 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008867 return ret;
8868}
8869
8870/* Fast path for ascii => ascii translation. Return 1 if the whole string
8871 was translated into writer, return 0 if the input string was partially
8872 translated into writer, raise an exception and return -1 on error. */
8873static int
8874unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008875 _PyUnicodeWriter *writer, int ignore,
8876 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877{
Victor Stinner872b2912014-04-05 14:27:07 +02008878 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 Py_ssize_t len;
8880 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008881 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883 len = PyUnicode_GET_LENGTH(input);
8884
Victor Stinner872b2912014-04-05 14:27:07 +02008885 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886
8887 in = PyUnicode_1BYTE_DATA(input);
8888 end = in + len;
8889
8890 assert(PyUnicode_IS_ASCII(writer->buffer));
8891 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8892 out = PyUnicode_1BYTE_DATA(writer->buffer);
8893
Victor Stinner872b2912014-04-05 14:27:07 +02008894 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008896 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008898 int translate = unicode_fast_translate_lookup(mapping, ch,
8899 ascii_table);
8900 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008902 if (translate == 0)
8903 goto exit;
8904 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008905 }
Victor Stinner872b2912014-04-05 14:27:07 +02008906 if (ch2 == 0xfe) {
8907 if (ignore)
8908 continue;
8909 goto exit;
8910 }
8911 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008913 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 }
Victor Stinner872b2912014-04-05 14:27:07 +02008915 res = 1;
8916
8917exit:
8918 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008919 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008920 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921}
8922
Victor Stinner3222da22015-10-01 22:07:32 +02008923static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924_PyUnicode_TranslateCharmap(PyObject *input,
8925 PyObject *mapping,
8926 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 Py_ssize_t size, i;
8931 int kind;
8932 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 _PyUnicodeWriter writer;
8934 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008935 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936 PyObject *errorHandler = NULL;
8937 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008939 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 PyErr_BadArgument();
8943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 if (PyUnicode_READY(input) == -1)
8947 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008948 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 kind = PyUnicode_KIND(input);
8950 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008952 if (size == 0)
8953 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955 /* allocate enough for a simple 1:1 translation without
8956 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008957 _PyUnicodeWriter_Init(&writer);
8958 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960
Victor Stinner872b2912014-04-05 14:27:07 +02008961 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8962
Victor Stinner33798672016-03-01 21:59:58 +01008963 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008964 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008965 if (PyUnicode_IS_ASCII(input)) {
8966 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8967 if (res < 0) {
8968 _PyUnicodeWriter_Dealloc(&writer);
8969 return NULL;
8970 }
8971 if (res == 1)
8972 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973 }
Victor Stinner33798672016-03-01 21:59:58 +01008974 else {
8975 i = 0;
8976 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 int translate;
8981 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8982 Py_ssize_t newpos;
8983 /* startpos for collecting untranslatable chars */
8984 Py_ssize_t collstart;
8985 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 ch = PyUnicode_READ(kind, data, i);
8989 translate = charmaptranslate_output(ch, mapping, &writer);
8990 if (translate < 0)
8991 goto onError;
8992
8993 if (translate != 0) {
8994 /* it worked => adjust input pointer */
8995 ++i;
8996 continue;
8997 }
8998
8999 /* untranslatable character */
9000 collstart = i;
9001 collend = i+1;
9002
9003 /* find all untranslatable characters */
9004 while (collend < size) {
9005 PyObject *x;
9006 ch = PyUnicode_READ(kind, data, collend);
9007 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 Py_XDECREF(x);
9010 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 ++collend;
9013 }
9014
9015 if (ignore) {
9016 i = collend;
9017 }
9018 else {
9019 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9020 reason, input, &exc,
9021 collstart, collend, &newpos);
9022 if (repunicode == NULL)
9023 goto onError;
9024 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009026 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009027 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009028 Py_DECREF(repunicode);
9029 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009030 }
9031 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032 Py_XDECREF(exc);
9033 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009034 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009037 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038 Py_XDECREF(exc);
9039 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 return NULL;
9041}
9042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043/* Deprecated. Use PyUnicode_Translate instead. */
9044PyObject *
9045PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9046 Py_ssize_t size,
9047 PyObject *mapping,
9048 const char *errors)
9049{
Christian Heimes5f520f42012-09-11 14:03:25 +02009050 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009051 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (!unicode)
9053 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009054 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9055 Py_DECREF(unicode);
9056 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057}
9058
Alexander Belopolsky40018472011-02-26 01:02:56 +00009059PyObject *
9060PyUnicode_Translate(PyObject *str,
9061 PyObject *mapping,
9062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009064 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009065 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009066 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067}
Tim Petersced69f82003-09-16 20:30:58 +00009068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069PyObject *
9070_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9071{
9072 if (!PyUnicode_Check(unicode)) {
9073 PyErr_BadInternalCall();
9074 return NULL;
9075 }
9076 if (PyUnicode_READY(unicode) == -1)
9077 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009078 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 /* If the string is already ASCII, just return the same string */
9080 Py_INCREF(unicode);
9081 return unicode;
9082 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009083
9084 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9085 PyObject *result = PyUnicode_New(len, 127);
9086 if (result == NULL) {
9087 return NULL;
9088 }
9089
9090 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9091 int kind = PyUnicode_KIND(unicode);
9092 const void *data = PyUnicode_DATA(unicode);
9093 Py_ssize_t i;
9094 for (i = 0; i < len; ++i) {
9095 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9096 if (ch < 127) {
9097 out[i] = ch;
9098 }
9099 else if (Py_UNICODE_ISSPACE(ch)) {
9100 out[i] = ' ';
9101 }
9102 else {
9103 int decimal = Py_UNICODE_TODECIMAL(ch);
9104 if (decimal < 0) {
9105 out[i] = '?';
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009106 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009107 _PyUnicode_LENGTH(result) = i + 1;
9108 break;
9109 }
9110 out[i] = '0' + decimal;
9111 }
9112 }
9113
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009114 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009115 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116}
9117
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118PyObject *
9119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9120 Py_ssize_t length)
9121{
Victor Stinnerf0124502011-11-21 23:12:56 +01009122 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009124 Py_UCS4 maxchar;
9125 enum PyUnicode_Kind kind;
9126 void *data;
9127
Victor Stinner99d7ad02012-02-22 13:37:39 +01009128 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009129 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009130 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 if (ch > 127) {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009134 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009135 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 }
9137 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009138
9139 /* Copy to a new string */
9140 decimal = PyUnicode_New(length, maxchar);
9141 if (decimal == NULL)
9142 return decimal;
9143 kind = PyUnicode_KIND(decimal);
9144 data = PyUnicode_DATA(decimal);
9145 /* Iterate over code points */
9146 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009147 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009148 if (ch > 127) {
9149 int decimal = Py_UNICODE_TODECIMAL(ch);
9150 if (decimal >= 0)
9151 ch = '0' + decimal;
9152 }
9153 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009155 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009157/* --- Decimal Encoder ---------------------------------------------------- */
9158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159int
9160PyUnicode_EncodeDecimal(Py_UNICODE *s,
9161 Py_ssize_t length,
9162 char *output,
9163 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009164{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009165 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009166 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009167 enum PyUnicode_Kind kind;
9168 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169
9170 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 PyErr_BadArgument();
9172 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009173 }
9174
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009175 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009176 if (unicode == NULL)
9177 return -1;
9178
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009183 PyObject *exc;
9184 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009192 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
Victor Stinner6345be92011-11-25 20:09:01 +01009206
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220}
9221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222/* --- Helpers ------------------------------------------------------------ */
9223
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length.  `end` is clamped to [0, len]: values above `len` become `len`,
   negative values count from the end.  A negative `start` likewise counts
   from the end and is clamped to 0; a `start` larger than `len` is left
   as is.

   The body is wrapped in do { } while (0) so that an invocation followed
   by a semicolon is exactly one statement, including inside an unbraced
   if/else.  Arguments are parenthesized; each may be evaluated more than
   once, so they must be free of side effects. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242 Py_ssize_t end,
9243 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 if (kind1 < kind2)
9252 return -1;
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277
Victor Stinner794d5672011-10-10 03:21:36 +02009278 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009293 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009294 }
9295 }
9296 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009311 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009321/* _PyUnicode_InsertThousandsGrouping() helper functions */
9322#include "stringlib/localeutil.h"
9323
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to only count (see below).
 * @n_buffer: Offset used to compute the initial buffer position handed to
 *         the fill helper: it is added to @writer->pos in filling mode and
 *         used on its own in counting mode.
 * @digits: Digits we're reading from. Must be NULL in counting mode
 *         (@writer is NULL) and non-NULL in filling mode.
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *         to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *         Output will be zero-padded on the left to fill.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be NULL in filling mode. In counting mode it must be
 *         non-NULL; it is initialised to 127 here and then passed on to
 *         InsertThousandsGrouping_fill().
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 * we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 * need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 * thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    /* The two modes take mutually exclusive arguments. */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(0 <= min_width);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;           /* running total that is returned */
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start one past the last character of their range;
       they are passed by address to the fill helper. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        /* counting mode: start from the ASCII range */
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Width of this group: at most the generator's width, but never
           more than what is still needed (digits or padding), and at
           least 1. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zeros zeros and n_chars chars */

        /* Account for this group (and its separator) in the total. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. The helper is called in both modes;
           writer is NULL when counting. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* Another group follows, so its separator also counts towards
           the minimum width. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement, i.e. the
           generator ran out of widths: emit everything that is left as
           one final group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zeros zeros and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9465
9466
Alexander Belopolsky40018472011-02-26 01:02:56 +00009467Py_ssize_t
9468PyUnicode_Count(PyObject *str,
9469 PyObject *substr,
9470 Py_ssize_t start,
9471 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009473 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009474 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 void *buf1 = NULL, *buf2 = NULL;
9476 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009477
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009478 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009480
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009481 kind1 = PyUnicode_KIND(str);
9482 kind2 = PyUnicode_KIND(substr);
9483 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009484 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009485
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009486 len1 = PyUnicode_GET_LENGTH(str);
9487 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009489 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009490 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009491
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009492 buf1 = PyUnicode_DATA(str);
9493 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009494 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009495 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009496 if (!buf2)
9497 goto onError;
9498 }
9499
9500 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009502 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009503 result = asciilib_count(
9504 ((Py_UCS1*)buf1) + start, end - start,
9505 buf2, len2, PY_SSIZE_T_MAX
9506 );
9507 else
9508 result = ucs1lib_count(
9509 ((Py_UCS1*)buf1) + start, end - start,
9510 buf2, len2, PY_SSIZE_T_MAX
9511 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 break;
9513 case PyUnicode_2BYTE_KIND:
9514 result = ucs2lib_count(
9515 ((Py_UCS2*)buf1) + start, end - start,
9516 buf2, len2, PY_SSIZE_T_MAX
9517 );
9518 break;
9519 case PyUnicode_4BYTE_KIND:
9520 result = ucs4lib_count(
9521 ((Py_UCS4*)buf1) + start, end - start,
9522 buf2, len2, PY_SSIZE_T_MAX
9523 );
9524 break;
9525 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009526 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009528
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009529 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 PyMem_Free(buf2);
9531
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009534 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 PyMem_Free(buf2);
9536 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537}
9538
Alexander Belopolsky40018472011-02-26 01:02:56 +00009539Py_ssize_t
9540PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009541 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009542 Py_ssize_t start,
9543 Py_ssize_t end,
9544 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009546 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009548
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009549 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550}
9551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552Py_ssize_t
9553PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9554 Py_ssize_t start, Py_ssize_t end,
9555 int direction)
9556{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009558 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 if (PyUnicode_READY(str) == -1)
9560 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009561 len = PyUnicode_GET_LENGTH(str);
9562 ADJUST_INDICES(start, end, len);
9563 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009564 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009566 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9567 kind, end-start, ch, direction);
9568 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009570 else
9571 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572}
9573
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009575tailmatch(PyObject *self,
9576 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577 Py_ssize_t start,
9578 Py_ssize_t end,
9579 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 int kind_self;
9582 int kind_sub;
9583 void *data_self;
9584 void *data_sub;
9585 Py_ssize_t offset;
9586 Py_ssize_t i;
9587 Py_ssize_t end_sub;
9588
9589 if (PyUnicode_READY(self) == -1 ||
9590 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009591 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9594 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009596 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009598 if (PyUnicode_GET_LENGTH(substring) == 0)
9599 return 1;
9600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 kind_self = PyUnicode_KIND(self);
9602 data_self = PyUnicode_DATA(self);
9603 kind_sub = PyUnicode_KIND(substring);
9604 data_sub = PyUnicode_DATA(substring);
9605 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9606
9607 if (direction > 0)
9608 offset = end;
9609 else
9610 offset = start;
9611
9612 if (PyUnicode_READ(kind_self, data_self, offset) ==
9613 PyUnicode_READ(kind_sub, data_sub, 0) &&
9614 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9615 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9616 /* If both are of the same kind, memcmp is sufficient */
9617 if (kind_self == kind_sub) {
9618 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009619 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 data_sub,
9621 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009622 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009624 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 else {
9626 /* We do not need to compare 0 and len(substring)-1 because
9627 the if statement above ensured already that they are equal
9628 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 for (i = 1; i < end_sub; ++i) {
9630 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9631 PyUnicode_READ(kind_sub, data_sub, i))
9632 return 0;
9633 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 }
9637
9638 return 0;
9639}
9640
Alexander Belopolsky40018472011-02-26 01:02:56 +00009641Py_ssize_t
9642PyUnicode_Tailmatch(PyObject *str,
9643 PyObject *substr,
9644 Py_ssize_t start,
9645 Py_ssize_t end,
9646 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009648 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009650
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009651 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654static PyObject *
9655ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9658 char *resdata, *data = PyUnicode_DATA(self);
9659 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009660
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 res = PyUnicode_New(len, 127);
9662 if (res == NULL)
9663 return NULL;
9664 resdata = PyUnicode_DATA(res);
9665 if (lower)
9666 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 _Py_bytes_upper(resdata, data, len);
9669 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670}
9671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 Py_ssize_t j;
9676 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009677 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009679
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9681
9682 where ! is a negation and \p{xxx} is a character with property xxx.
9683 */
9684 for (j = i - 1; j >= 0; j--) {
9685 c = PyUnicode_READ(kind, data, j);
9686 if (!_PyUnicode_IsCaseIgnorable(c))
9687 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9690 if (final_sigma) {
9691 for (j = i + 1; j < length; j++) {
9692 c = PyUnicode_READ(kind, data, j);
9693 if (!_PyUnicode_IsCaseIgnorable(c))
9694 break;
9695 }
9696 final_sigma = j == length || !_PyUnicode_IsCased(c);
9697 }
9698 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701static int
9702lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9703 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 /* Obscure special case. */
9706 if (c == 0x3A3) {
9707 mapped[0] = handle_capital_sigma(kind, data, length, i);
9708 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711}
9712
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713static Py_ssize_t
9714do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 Py_ssize_t i, k = 0;
9717 int n_res, j;
9718 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 c = PyUnicode_READ(kind, data, 0);
9721 n_res = _PyUnicode_ToUpperFull(c, mapped);
9722 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009723 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 for (i = 1; i < length; i++) {
9727 c = PyUnicode_READ(kind, data, i);
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009730 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009732 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009733 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737static Py_ssize_t
9738do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9739 Py_ssize_t i, k = 0;
9740
9741 for (i = 0; i < length; i++) {
9742 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9743 int n_res, j;
9744 if (Py_UNICODE_ISUPPER(c)) {
9745 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9746 }
9747 else if (Py_UNICODE_ISLOWER(c)) {
9748 n_res = _PyUnicode_ToUpperFull(c, mapped);
9749 }
9750 else {
9751 n_res = 1;
9752 mapped[0] = c;
9753 }
9754 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009755 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 res[k++] = mapped[j];
9757 }
9758 }
9759 return k;
9760}
9761
9762static Py_ssize_t
9763do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9764 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009766 Py_ssize_t i, k = 0;
9767
9768 for (i = 0; i < length; i++) {
9769 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9770 int n_res, j;
9771 if (lower)
9772 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9773 else
9774 n_res = _PyUnicode_ToUpperFull(c, mapped);
9775 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009776 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777 res[k++] = mapped[j];
9778 }
9779 }
9780 return k;
9781}
9782
9783static Py_ssize_t
9784do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9785{
9786 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9787}
9788
9789static Py_ssize_t
9790do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9793}
9794
Benjamin Petersone51757f2012-01-12 21:10:29 -05009795static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009796do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799
9800 for (i = 0; i < length; i++) {
9801 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9802 Py_UCS4 mapped[3];
9803 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9804 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009805 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009806 res[k++] = mapped[j];
9807 }
9808 }
9809 return k;
9810}
9811
9812static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009813do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9814{
9815 Py_ssize_t i, k = 0;
9816 int previous_is_cased;
9817
9818 previous_is_cased = 0;
9819 for (i = 0; i < length; i++) {
9820 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9821 Py_UCS4 mapped[3];
9822 int n_res, j;
9823
9824 if (previous_is_cased)
9825 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9826 else
9827 n_res = _PyUnicode_ToTitleFull(c, mapped);
9828
9829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009831 res[k++] = mapped[j];
9832 }
9833
9834 previous_is_cased = _PyUnicode_IsCased(c);
9835 }
9836 return k;
9837}
9838
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839static PyObject *
9840case_operation(PyObject *self,
9841 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9842{
9843 PyObject *res = NULL;
9844 Py_ssize_t length, newlength = 0;
9845 int kind, outkind;
9846 void *data, *outdata;
9847 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9848
Benjamin Petersoneea48462012-01-16 14:28:50 -05009849 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009850
9851 kind = PyUnicode_KIND(self);
9852 data = PyUnicode_DATA(self);
9853 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009854 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009855 PyErr_SetString(PyExc_OverflowError, "string is too long");
9856 return NULL;
9857 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009858 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859 if (tmp == NULL)
9860 return PyErr_NoMemory();
9861 newlength = perform(kind, data, length, tmp, &maxchar);
9862 res = PyUnicode_New(newlength, maxchar);
9863 if (res == NULL)
9864 goto leave;
9865 tmpend = tmp + newlength;
9866 outdata = PyUnicode_DATA(res);
9867 outkind = PyUnicode_KIND(res);
9868 switch (outkind) {
9869 case PyUnicode_1BYTE_KIND:
9870 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9871 break;
9872 case PyUnicode_2BYTE_KIND:
9873 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9874 break;
9875 case PyUnicode_4BYTE_KIND:
9876 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9877 break;
9878 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009879 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 }
9881 leave:
9882 PyMem_FREE(tmp);
9883 return res;
9884}
9885
Tim Peters8ce9f162004-08-27 01:49:32 +00009886PyObject *
9887PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009889 PyObject *res;
9890 PyObject *fseq;
9891 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009892 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009894 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009895 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009897 }
9898
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009899 /* NOTE: the following code can't call back into Python code,
9900 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009901 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009902
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009903 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009905 res = _PyUnicode_JoinArray(separator, items, seqlen);
9906 Py_DECREF(fseq);
9907 return res;
9908}
9909
9910PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009911_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009912{
9913 PyObject *res = NULL; /* the result */
9914 PyObject *sep = NULL;
9915 Py_ssize_t seplen;
9916 PyObject *item;
9917 Py_ssize_t sz, i, res_offset;
9918 Py_UCS4 maxchar;
9919 Py_UCS4 item_maxchar;
9920 int use_memcpy;
9921 unsigned char *res_data = NULL, *sep_data = NULL;
9922 PyObject *last_obj;
9923 unsigned int kind = 0;
9924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 /* If empty sequence, return u"". */
9926 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009927 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009928 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009929
Tim Peters05eba1f2004-08-27 21:32:02 +00009930 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009931 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009932 if (seqlen == 1) {
9933 if (PyUnicode_CheckExact(items[0])) {
9934 res = items[0];
9935 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009936 return res;
9937 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009938 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009939 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009940 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009942 /* Set up sep and seplen */
9943 if (separator == NULL) {
9944 /* fall back to a blank space separator */
9945 sep = PyUnicode_FromOrdinal(' ');
9946 if (!sep)
9947 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009948 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009949 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009950 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009951 else {
9952 if (!PyUnicode_Check(separator)) {
9953 PyErr_Format(PyExc_TypeError,
9954 "separator: expected str instance,"
9955 " %.80s found",
9956 Py_TYPE(separator)->tp_name);
9957 goto onError;
9958 }
9959 if (PyUnicode_READY(separator))
9960 goto onError;
9961 sep = separator;
9962 seplen = PyUnicode_GET_LENGTH(separator);
9963 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9964 /* inc refcount to keep this code path symmetric with the
9965 above case of a blank separator */
9966 Py_INCREF(sep);
9967 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009968 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009969 }
9970
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009971 /* There are at least two things to join, or else we have a subclass
9972 * of str in the sequence.
9973 * Do a pre-pass to figure out the total amount of space we'll
9974 * need (sz), and see whether all argument are strings.
9975 */
9976 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009977#ifdef Py_DEBUG
9978 use_memcpy = 0;
9979#else
9980 use_memcpy = 1;
9981#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009982 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009983 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009984 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 if (!PyUnicode_Check(item)) {
9986 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009987 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 " %.80s found",
9989 i, Py_TYPE(item)->tp_name);
9990 goto onError;
9991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 if (PyUnicode_READY(item) == -1)
9993 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009994 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009996 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009997 if (i != 0) {
9998 add_sz += seplen;
9999 }
10000 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 goto onError;
10004 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010005 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010006 if (use_memcpy && last_obj != NULL) {
10007 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10008 use_memcpy = 0;
10009 }
10010 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010011 }
Tim Petersced69f82003-09-16 20:30:58 +000010012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010014 if (res == NULL)
10015 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010016
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010018#ifdef Py_DEBUG
10019 use_memcpy = 0;
10020#else
10021 if (use_memcpy) {
10022 res_data = PyUnicode_1BYTE_DATA(res);
10023 kind = PyUnicode_KIND(res);
10024 if (seplen != 0)
10025 sep_data = PyUnicode_1BYTE_DATA(sep);
10026 }
10027#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010028 if (use_memcpy) {
10029 for (i = 0; i < seqlen; ++i) {
10030 Py_ssize_t itemlen;
10031 item = items[i];
10032
10033 /* Copy item, and maybe the separator. */
10034 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010035 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010036 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010037 kind * seplen);
10038 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010040
10041 itemlen = PyUnicode_GET_LENGTH(item);
10042 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010043 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010044 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010045 kind * itemlen);
10046 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010048 }
10049 assert(res_data == PyUnicode_1BYTE_DATA(res)
10050 + kind * PyUnicode_GET_LENGTH(res));
10051 }
10052 else {
10053 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10054 Py_ssize_t itemlen;
10055 item = items[i];
10056
10057 /* Copy item, and maybe the separator. */
10058 if (i && seplen != 0) {
10059 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10060 res_offset += seplen;
10061 }
10062
10063 itemlen = PyUnicode_GET_LENGTH(item);
10064 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010065 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010066 res_offset += itemlen;
10067 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010068 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010070 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010073 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010078 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079 return NULL;
10080}
10081
Victor Stinnerd3f08822012-05-29 12:57:52 +020010082void
10083_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10084 Py_UCS4 fill_char)
10085{
10086 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner7f9fb0f2018-11-27 12:42:04 +010010087 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010088 assert(PyUnicode_IS_READY(unicode));
10089 assert(unicode_modifiable(unicode));
10090 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10091 assert(start >= 0);
10092 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10093 FILL(kind, data, fill_char, start, length);
10094}
10095
Victor Stinner3fe55312012-01-04 00:33:50 +010010096Py_ssize_t
10097PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10098 Py_UCS4 fill_char)
10099{
10100 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010101
10102 if (!PyUnicode_Check(unicode)) {
10103 PyErr_BadInternalCall();
10104 return -1;
10105 }
10106 if (PyUnicode_READY(unicode) == -1)
10107 return -1;
10108 if (unicode_check_modifiable(unicode))
10109 return -1;
10110
Victor Stinnerd3f08822012-05-29 12:57:52 +020010111 if (start < 0) {
10112 PyErr_SetString(PyExc_IndexError, "string index out of range");
10113 return -1;
10114 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010115 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10116 PyErr_SetString(PyExc_ValueError,
10117 "fill character is bigger than "
10118 "the string maximum character");
10119 return -1;
10120 }
10121
10122 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10123 length = Py_MIN(maxlen, length);
10124 if (length <= 0)
10125 return 0;
10126
Victor Stinnerd3f08822012-05-29 12:57:52 +020010127 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010128 return length;
10129}
10130
Victor Stinner9310abb2011-10-05 00:59:23 +020010131static PyObject *
10132pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010133 Py_ssize_t left,
10134 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 PyObject *u;
10138 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010139 int kind;
10140 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
10142 if (left < 0)
10143 left = 0;
10144 if (right < 0)
10145 right = 0;
10146
Victor Stinnerc4b49542011-12-11 22:44:26 +010010147 if (left == 0 && right == 0)
10148 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10151 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010152 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10153 return NULL;
10154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010156 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010158 if (!u)
10159 return NULL;
10160
10161 kind = PyUnicode_KIND(u);
10162 data = PyUnicode_DATA(u);
10163 if (left)
10164 FILL(kind, data, fill, 0, left);
10165 if (right)
10166 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010167 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010168 assert(_PyUnicode_CheckConsistency(u, 1));
10169 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170}
10171
Alexander Belopolsky40018472011-02-26 01:02:56 +000010172PyObject *
10173PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010177 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179
Benjamin Petersonead6b532011-12-20 17:23:42 -060010180 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 if (PyUnicode_IS_ASCII(string))
10183 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010185 PyUnicode_GET_LENGTH(string), keepends);
10186 else
10187 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 break;
10191 case PyUnicode_2BYTE_KIND:
10192 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010193 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 PyUnicode_GET_LENGTH(string), keepends);
10195 break;
10196 case PyUnicode_4BYTE_KIND:
10197 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 PyUnicode_GET_LENGTH(string), keepends);
10200 break;
10201 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010202 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205}
10206
Alexander Belopolsky40018472011-02-26 01:02:56 +000010207static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010208split(PyObject *self,
10209 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010210 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010212 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 void *buf1, *buf2;
10214 Py_ssize_t len1, len2;
10215 PyObject* out;
10216
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010218 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 if (PyUnicode_READY(self) == -1)
10221 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010224 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010226 if (PyUnicode_IS_ASCII(self))
10227 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010228 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 PyUnicode_GET_LENGTH(self), maxcount
10230 );
10231 else
10232 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010234 PyUnicode_GET_LENGTH(self), maxcount
10235 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 case PyUnicode_2BYTE_KIND:
10237 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 PyUnicode_GET_LENGTH(self), maxcount
10240 );
10241 case PyUnicode_4BYTE_KIND:
10242 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 PyUnicode_GET_LENGTH(self), maxcount
10245 );
10246 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010247 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 }
10249
10250 if (PyUnicode_READY(substring) == -1)
10251 return NULL;
10252
10253 kind1 = PyUnicode_KIND(self);
10254 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 len1 = PyUnicode_GET_LENGTH(self);
10256 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010257 if (kind1 < kind2 || len1 < len2) {
10258 out = PyList_New(1);
10259 if (out == NULL)
10260 return NULL;
10261 Py_INCREF(self);
10262 PyList_SET_ITEM(out, 0, self);
10263 return out;
10264 }
10265 buf1 = PyUnicode_DATA(self);
10266 buf2 = PyUnicode_DATA(substring);
10267 if (kind2 != kind1) {
10268 buf2 = _PyUnicode_AsKind(substring, kind1);
10269 if (!buf2)
10270 return NULL;
10271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010273 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010275 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10276 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010277 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 else
10279 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010280 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 break;
10282 case PyUnicode_2BYTE_KIND:
10283 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010284 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 break;
10286 case PyUnicode_4BYTE_KIND:
10287 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 break;
10290 default:
10291 out = NULL;
10292 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010293 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 PyMem_Free(buf2);
10295 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296}
10297
Alexander Belopolsky40018472011-02-26 01:02:56 +000010298static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010299rsplit(PyObject *self,
10300 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010301 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010302{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010303 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 void *buf1, *buf2;
10305 Py_ssize_t len1, len2;
10306 PyObject* out;
10307
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010308 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010309 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 if (PyUnicode_READY(self) == -1)
10312 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010315 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010317 if (PyUnicode_IS_ASCII(self))
10318 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 PyUnicode_GET_LENGTH(self), maxcount
10321 );
10322 else
10323 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 PyUnicode_GET_LENGTH(self), maxcount
10326 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 case PyUnicode_2BYTE_KIND:
10328 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 PyUnicode_GET_LENGTH(self), maxcount
10331 );
10332 case PyUnicode_4BYTE_KIND:
10333 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 PyUnicode_GET_LENGTH(self), maxcount
10336 );
10337 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010338 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 }
10340
10341 if (PyUnicode_READY(substring) == -1)
10342 return NULL;
10343
10344 kind1 = PyUnicode_KIND(self);
10345 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 len1 = PyUnicode_GET_LENGTH(self);
10347 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010348 if (kind1 < kind2 || len1 < len2) {
10349 out = PyList_New(1);
10350 if (out == NULL)
10351 return NULL;
10352 Py_INCREF(self);
10353 PyList_SET_ITEM(out, 0, self);
10354 return out;
10355 }
10356 buf1 = PyUnicode_DATA(self);
10357 buf2 = PyUnicode_DATA(substring);
10358 if (kind2 != kind1) {
10359 buf2 = _PyUnicode_AsKind(substring, kind1);
10360 if (!buf2)
10361 return NULL;
10362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010364 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010366 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10367 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010368 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369 else
10370 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010371 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 break;
10373 case PyUnicode_2BYTE_KIND:
10374 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 break;
10377 case PyUnicode_4BYTE_KIND:
10378 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 break;
10381 default:
10382 out = NULL;
10383 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010384 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 PyMem_Free(buf2);
10386 return out;
10387}
10388
10389static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010390anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10391 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010393 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010395 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10396 return asciilib_find(buf1, len1, buf2, len2, offset);
10397 else
10398 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 case PyUnicode_2BYTE_KIND:
10400 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10401 case PyUnicode_4BYTE_KIND:
10402 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10403 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010404 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405}
10406
10407static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010408anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10409 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010411 switch (kind) {
10412 case PyUnicode_1BYTE_KIND:
10413 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10414 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10415 else
10416 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10417 case PyUnicode_2BYTE_KIND:
10418 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10419 case PyUnicode_4BYTE_KIND:
10420 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10421 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010422 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010423}
10424
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010425static void
10426replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10427 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10428{
10429 int kind = PyUnicode_KIND(u);
10430 void *data = PyUnicode_DATA(u);
10431 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10432 if (kind == PyUnicode_1BYTE_KIND) {
10433 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10434 (Py_UCS1 *)data + len,
10435 u1, u2, maxcount);
10436 }
10437 else if (kind == PyUnicode_2BYTE_KIND) {
10438 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10439 (Py_UCS2 *)data + len,
10440 u1, u2, maxcount);
10441 }
10442 else {
10443 assert(kind == PyUnicode_4BYTE_KIND);
10444 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10445 (Py_UCS4 *)data + len,
10446 u1, u2, maxcount);
10447 }
10448}
10449
Alexander Belopolsky40018472011-02-26 01:02:56 +000010450static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451replace(PyObject *self, PyObject *str1,
10452 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 PyObject *u;
10455 char *sbuf = PyUnicode_DATA(self);
10456 char *buf1 = PyUnicode_DATA(str1);
10457 char *buf2 = PyUnicode_DATA(str2);
10458 int srelease = 0, release1 = 0, release2 = 0;
10459 int skind = PyUnicode_KIND(self);
10460 int kind1 = PyUnicode_KIND(str1);
10461 int kind2 = PyUnicode_KIND(str2);
10462 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10463 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10464 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010465 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010466 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467
10468 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010469 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010471 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472
Victor Stinner59de0ee2011-10-07 10:01:28 +020010473 if (str1 == str2)
10474 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475
Victor Stinner49a0a212011-10-12 23:46:10 +020010476 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010477 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10478 if (maxchar < maxchar_str1)
10479 /* substring too wide to be present */
10480 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010481 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10482 /* Replacing str1 with str2 may cause a maxchar reduction in the
10483 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010484 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010485 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010488 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010490 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010492 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010493 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010494 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010495
Victor Stinner69ed0f42013-04-09 21:48:24 +020010496 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010498 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010499 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010500 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010504
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010505 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10506 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 }
10508 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 int rkind = skind;
10510 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010511 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (kind1 < rkind) {
10514 /* widen substring */
10515 buf1 = _PyUnicode_AsKind(str1, rkind);
10516 if (!buf1) goto error;
10517 release1 = 1;
10518 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010520 if (i < 0)
10521 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (rkind > kind2) {
10523 /* widen replacement */
10524 buf2 = _PyUnicode_AsKind(str2, rkind);
10525 if (!buf2) goto error;
10526 release2 = 1;
10527 }
10528 else if (rkind < kind2) {
10529 /* widen self and buf1 */
10530 rkind = kind2;
10531 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010532 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 sbuf = _PyUnicode_AsKind(self, rkind);
10534 if (!sbuf) goto error;
10535 srelease = 1;
10536 buf1 = _PyUnicode_AsKind(str1, rkind);
10537 if (!buf1) goto error;
10538 release1 = 1;
10539 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 u = PyUnicode_New(slen, maxchar);
10541 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 assert(PyUnicode_KIND(u) == rkind);
10544 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010545
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010546 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010547 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010548 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010550 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010552
10553 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010554 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010555 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010556 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010557 if (i == -1)
10558 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010559 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 }
10566 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010568 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 int rkind = skind;
10570 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 buf1 = _PyUnicode_AsKind(str1, rkind);
10575 if (!buf1) goto error;
10576 release1 = 1;
10577 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010578 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010579 if (n == 0)
10580 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 buf2 = _PyUnicode_AsKind(str2, rkind);
10584 if (!buf2) goto error;
10585 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 rkind = kind2;
10590 sbuf = _PyUnicode_AsKind(self, rkind);
10591 if (!sbuf) goto error;
10592 srelease = 1;
10593 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010594 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 buf1 = _PyUnicode_AsKind(str1, rkind);
10596 if (!buf1) goto error;
10597 release1 = 1;
10598 }
10599 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10600 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010601 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 PyErr_SetString(PyExc_OverflowError,
10603 "replace string is too long");
10604 goto error;
10605 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010606 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010608 _Py_INCREF_UNICODE_EMPTY();
10609 if (!unicode_empty)
10610 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010611 u = unicode_empty;
10612 goto done;
10613 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010614 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 PyErr_SetString(PyExc_OverflowError,
10616 "replace string is too long");
10617 goto error;
10618 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010619 u = PyUnicode_New(new_size, maxchar);
10620 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010622 assert(PyUnicode_KIND(u) == rkind);
10623 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 ires = i = 0;
10625 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010626 while (n-- > 0) {
10627 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010628 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010629 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010630 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010631 if (j == -1)
10632 break;
10633 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010635 memcpy(res + rkind * ires,
10636 sbuf + rkind * i,
10637 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010639 }
10640 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010642 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010644 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010651 memcpy(res + rkind * ires,
10652 sbuf + rkind * i,
10653 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010654 }
10655 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 /* interleave */
10657 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010658 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 if (--n <= 0)
10663 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * ires,
10665 sbuf + rkind * i,
10666 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 ires++;
10668 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010670 memcpy(res + rkind * ires,
10671 sbuf + rkind * i,
10672 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010674 }
10675
10676 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010677 unicode_adjust_maxchar(&u);
10678 if (u == NULL)
10679 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010681
10682 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 if (srelease)
10684 PyMem_FREE(sbuf);
10685 if (release1)
10686 PyMem_FREE(buf1);
10687 if (release2)
10688 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010689 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010691
Benjamin Peterson29060642009-01-31 22:14:21 +000010692 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (srelease)
10695 PyMem_FREE(sbuf);
10696 if (release1)
10697 PyMem_FREE(buf1);
10698 if (release2)
10699 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010700 return unicode_result_unchanged(self);
10701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 error:
10703 if (srelease && sbuf)
10704 PyMem_FREE(sbuf);
10705 if (release1 && buf1)
10706 PyMem_FREE(buf1);
10707 if (release2 && buf2)
10708 PyMem_FREE(buf2);
10709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710}
10711
10712/* --- Unicode Object Methods --------------------------------------------- */
10713
INADA Naoki3ae20562017-01-16 20:41:20 +090010714/*[clinic input]
10715str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
INADA Naoki3ae20562017-01-16 20:41:20 +090010717Return a version of the string where each word is titlecased.
10718
10719More specifically, words start with uppercased characters and all remaining
10720cased characters have lower case.
10721[clinic start generated code]*/
10722
10723static PyObject *
10724unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010725/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010727 if (PyUnicode_READY(self) == -1)
10728 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010729 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730}
10731
INADA Naoki3ae20562017-01-16 20:41:20 +090010732/*[clinic input]
10733str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734
INADA Naoki3ae20562017-01-16 20:41:20 +090010735Return a capitalized version of the string.
10736
10737More specifically, make the first character have upper case and the rest lower
10738case.
10739[clinic start generated code]*/
10740
10741static PyObject *
10742unicode_capitalize_impl(PyObject *self)
10743/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010745 if (PyUnicode_READY(self) == -1)
10746 return NULL;
10747 if (PyUnicode_GET_LENGTH(self) == 0)
10748 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010749 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750}
10751
INADA Naoki3ae20562017-01-16 20:41:20 +090010752/*[clinic input]
10753str.casefold as unicode_casefold
10754
10755Return a version of the string suitable for caseless comparisons.
10756[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010757
10758static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010759unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010760/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010761{
10762 if (PyUnicode_READY(self) == -1)
10763 return NULL;
10764 if (PyUnicode_IS_ASCII(self))
10765 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010766 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010767}
10768
10769
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010770/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010771
10772static int
10773convert_uc(PyObject *obj, void *addr)
10774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010777 if (!PyUnicode_Check(obj)) {
10778 PyErr_Format(PyExc_TypeError,
10779 "The fill character must be a unicode character, "
10780 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010781 return 0;
10782 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010783 if (PyUnicode_READY(obj) < 0)
10784 return 0;
10785 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010786 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010787 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010788 return 0;
10789 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010790 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010791 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010792}
10793
INADA Naoki3ae20562017-01-16 20:41:20 +090010794/*[clinic input]
10795str.center as unicode_center
10796
10797 width: Py_ssize_t
10798 fillchar: Py_UCS4 = ' '
10799 /
10800
10801Return a centered string of length width.
10802
10803Padding is done using the specified fill character (default is a space).
10804[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805
10806static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010807unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10808/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010810 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811
Benjamin Petersonbac79492012-01-14 13:34:47 -050010812 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813 return NULL;
10814
Victor Stinnerc4b49542011-12-11 22:44:26 +010010815 if (PyUnicode_GET_LENGTH(self) >= width)
10816 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817
Victor Stinnerc4b49542011-12-11 22:44:26 +010010818 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819 left = marg / 2 + (marg & width & 1);
10820
Victor Stinner9310abb2011-10-05 00:59:23 +020010821 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822}
10823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824/* This function assumes that str1 and str2 are readied by the caller. */
10825
Marc-André Lemburge5034372000-08-08 08:04:29 +000010826static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010827unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010828{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010829#define COMPARE(TYPE1, TYPE2) \
10830 do { \
10831 TYPE1* p1 = (TYPE1 *)data1; \
10832 TYPE2* p2 = (TYPE2 *)data2; \
10833 TYPE1* end = p1 + len; \
10834 Py_UCS4 c1, c2; \
10835 for (; p1 != end; p1++, p2++) { \
10836 c1 = *p1; \
10837 c2 = *p2; \
10838 if (c1 != c2) \
10839 return (c1 < c2) ? -1 : 1; \
10840 } \
10841 } \
10842 while (0)
10843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 int kind1, kind2;
10845 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010846 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 kind1 = PyUnicode_KIND(str1);
10849 kind2 = PyUnicode_KIND(str2);
10850 data1 = PyUnicode_DATA(str1);
10851 data2 = PyUnicode_DATA(str2);
10852 len1 = PyUnicode_GET_LENGTH(str1);
10853 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010854 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010855
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010856 switch(kind1) {
10857 case PyUnicode_1BYTE_KIND:
10858 {
10859 switch(kind2) {
10860 case PyUnicode_1BYTE_KIND:
10861 {
10862 int cmp = memcmp(data1, data2, len);
10863 /* normalize result of memcmp() into the range [-1; 1] */
10864 if (cmp < 0)
10865 return -1;
10866 if (cmp > 0)
10867 return 1;
10868 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010869 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010870 case PyUnicode_2BYTE_KIND:
10871 COMPARE(Py_UCS1, Py_UCS2);
10872 break;
10873 case PyUnicode_4BYTE_KIND:
10874 COMPARE(Py_UCS1, Py_UCS4);
10875 break;
10876 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010877 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010878 }
10879 break;
10880 }
10881 case PyUnicode_2BYTE_KIND:
10882 {
10883 switch(kind2) {
10884 case PyUnicode_1BYTE_KIND:
10885 COMPARE(Py_UCS2, Py_UCS1);
10886 break;
10887 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010888 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010889 COMPARE(Py_UCS2, Py_UCS2);
10890 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010891 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010892 case PyUnicode_4BYTE_KIND:
10893 COMPARE(Py_UCS2, Py_UCS4);
10894 break;
10895 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010896 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010897 }
10898 break;
10899 }
10900 case PyUnicode_4BYTE_KIND:
10901 {
10902 switch(kind2) {
10903 case PyUnicode_1BYTE_KIND:
10904 COMPARE(Py_UCS4, Py_UCS1);
10905 break;
10906 case PyUnicode_2BYTE_KIND:
10907 COMPARE(Py_UCS4, Py_UCS2);
10908 break;
10909 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010910 {
10911#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10912 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10913 /* normalize result of wmemcmp() into the range [-1; 1] */
10914 if (cmp < 0)
10915 return -1;
10916 if (cmp > 0)
10917 return 1;
10918#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010919 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010920#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010921 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010922 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010923 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010924 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010925 }
10926 break;
10927 }
10928 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010929 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010930 }
10931
Victor Stinner770e19e2012-10-04 22:59:45 +020010932 if (len1 == len2)
10933 return 0;
10934 if (len1 < len2)
10935 return -1;
10936 else
10937 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010938
10939#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010940}
10941
Benjamin Peterson621b4302016-09-09 13:54:34 -070010942static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010943unicode_compare_eq(PyObject *str1, PyObject *str2)
10944{
10945 int kind;
10946 void *data1, *data2;
10947 Py_ssize_t len;
10948 int cmp;
10949
Victor Stinnere5567ad2012-10-23 02:48:49 +020010950 len = PyUnicode_GET_LENGTH(str1);
10951 if (PyUnicode_GET_LENGTH(str2) != len)
10952 return 0;
10953 kind = PyUnicode_KIND(str1);
10954 if (PyUnicode_KIND(str2) != kind)
10955 return 0;
10956 data1 = PyUnicode_DATA(str1);
10957 data2 = PyUnicode_DATA(str2);
10958
10959 cmp = memcmp(data1, data2, len * kind);
10960 return (cmp == 0);
10961}
10962
10963
Alexander Belopolsky40018472011-02-26 01:02:56 +000010964int
10965PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10968 if (PyUnicode_READY(left) == -1 ||
10969 PyUnicode_READY(right) == -1)
10970 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010971
10972 /* a string is equal to itself */
10973 if (left == right)
10974 return 0;
10975
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010976 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010978 PyErr_Format(PyExc_TypeError,
10979 "Can't compare %.100s and %.100s",
10980 left->ob_type->tp_name,
10981 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982 return -1;
10983}
10984
Martin v. Löwis5b222132007-06-10 09:51:05 +000010985int
10986PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 Py_ssize_t i;
10989 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010991 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992
Victor Stinner910337b2011-10-03 03:20:16 +020010993 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010994 if (!PyUnicode_IS_READY(uni)) {
10995 const wchar_t *ws = _PyUnicode_WSTR(uni);
10996 /* Compare Unicode string and source character set string */
10997 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10998 if (chr != ustr[i])
10999 return (chr < ustr[i]) ? -1 : 1;
11000 }
11001 /* This check keeps Python strings that end in '\0' from comparing equal
11002 to C strings identical up to that point. */
11003 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11004 return 1; /* uni is longer */
11005 if (ustr[i])
11006 return -1; /* str is longer */
11007 return 0;
11008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011010 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011011 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011012 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011013 size_t len, len2 = strlen(str);
11014 int cmp;
11015
11016 len = Py_MIN(len1, len2);
11017 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011018 if (cmp != 0) {
11019 if (cmp < 0)
11020 return -1;
11021 else
11022 return 1;
11023 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011024 if (len1 > len2)
11025 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011026 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011027 return -1; /* str is longer */
11028 return 0;
11029 }
11030 else {
11031 void *data = PyUnicode_DATA(uni);
11032 /* Compare Unicode string and source character set string */
11033 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011034 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011035 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11036 /* This check keeps Python strings that end in '\0' from comparing equal
11037 to C strings identical up to that point. */
11038 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11039 return 1; /* uni is longer */
11040 if (str[i])
11041 return -1; /* str is longer */
11042 return 0;
11043 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011044}
11045
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011046static int
11047non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11048{
11049 size_t i, len;
11050 const wchar_t *p;
11051 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11052 if (strlen(str) != len)
11053 return 0;
11054 p = _PyUnicode_WSTR(unicode);
11055 assert(p);
11056 for (i = 0; i < len; i++) {
11057 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011058 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011059 return 0;
11060 }
11061 return 1;
11062}
11063
11064int
11065_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11066{
11067 size_t len;
11068 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011069 assert(str);
11070#ifndef NDEBUG
11071 for (const char *p = str; *p; p++) {
11072 assert((unsigned char)*p < 128);
11073 }
11074#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011075 if (PyUnicode_READY(unicode) == -1) {
11076 /* Memory error or bad data */
11077 PyErr_Clear();
11078 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11079 }
11080 if (!PyUnicode_IS_ASCII(unicode))
11081 return 0;
11082 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11083 return strlen(str) == len &&
11084 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11085}
11086
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011087int
11088_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11089{
11090 PyObject *right_uni;
11091 Py_hash_t hash;
11092
11093 assert(_PyUnicode_CHECK(left));
11094 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011095#ifndef NDEBUG
11096 for (const char *p = right->string; *p; p++) {
11097 assert((unsigned char)*p < 128);
11098 }
11099#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011100
11101 if (PyUnicode_READY(left) == -1) {
11102 /* memory error or bad data */
11103 PyErr_Clear();
11104 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11105 }
11106
11107 if (!PyUnicode_IS_ASCII(left))
11108 return 0;
11109
11110 right_uni = _PyUnicode_FromId(right); /* borrowed */
11111 if (right_uni == NULL) {
11112 /* memory error or bad data */
11113 PyErr_Clear();
11114 return _PyUnicode_EqualToASCIIString(left, right->string);
11115 }
11116
11117 if (left == right_uni)
11118 return 1;
11119
11120 if (PyUnicode_CHECK_INTERNED(left))
11121 return 0;
11122
INADA Naoki7cc95f52018-01-28 02:07:09 +090011123 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011124 hash = _PyUnicode_HASH(left);
11125 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11126 return 0;
11127
11128 return unicode_compare_eq(left, right_uni);
11129}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011130
Alexander Belopolsky40018472011-02-26 01:02:56 +000011131PyObject *
11132PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011133{
11134 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011135
Victor Stinnere5567ad2012-10-23 02:48:49 +020011136 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11137 Py_RETURN_NOTIMPLEMENTED;
11138
11139 if (PyUnicode_READY(left) == -1 ||
11140 PyUnicode_READY(right) == -1)
11141 return NULL;
11142
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011143 if (left == right) {
11144 switch (op) {
11145 case Py_EQ:
11146 case Py_LE:
11147 case Py_GE:
11148 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011149 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011150 case Py_NE:
11151 case Py_LT:
11152 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011153 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011154 default:
11155 PyErr_BadArgument();
11156 return NULL;
11157 }
11158 }
11159 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011160 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011161 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011162 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011163 }
11164 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011165 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011166 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011167 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011168}
11169
Alexander Belopolsky40018472011-02-26 01:02:56 +000011170int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011171_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11172{
11173 return unicode_eq(aa, bb);
11174}
11175
11176int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011177PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011178{
Victor Stinner77282cb2013-04-14 19:22:47 +020011179 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 void *buf1, *buf2;
11181 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011182 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011183
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011184 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011186 "'in <string>' requires string as left operand, not %.100s",
11187 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011188 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011189 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011190 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011191 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011192 if (ensure_unicode(str) < 0)
11193 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011196 kind2 = PyUnicode_KIND(substr);
11197 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011198 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 len2 = PyUnicode_GET_LENGTH(substr);
11201 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011202 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011203 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011204 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011205 if (len2 == 1) {
11206 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11207 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011208 return result;
11209 }
11210 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011211 buf2 = _PyUnicode_AsKind(substr, kind1);
11212 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011213 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215
Victor Stinner77282cb2013-04-14 19:22:47 +020011216 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 case PyUnicode_1BYTE_KIND:
11218 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11219 break;
11220 case PyUnicode_2BYTE_KIND:
11221 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11222 break;
11223 case PyUnicode_4BYTE_KIND:
11224 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11225 break;
11226 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011227 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011229
Victor Stinner77282cb2013-04-14 19:22:47 +020011230 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 PyMem_Free(buf2);
11232
Guido van Rossum403d68b2000-03-13 15:55:09 +000011233 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011234}
11235
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236/* Concat to string or Unicode object giving a new Unicode object. */
11237
Alexander Belopolsky40018472011-02-26 01:02:56 +000011238PyObject *
11239PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011241 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011242 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011243 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011245 if (ensure_unicode(left) < 0)
11246 return NULL;
11247
11248 if (!PyUnicode_Check(right)) {
11249 PyErr_Format(PyExc_TypeError,
11250 "can only concatenate str (not \"%.200s\") to str",
11251 right->ob_type->tp_name);
11252 return NULL;
11253 }
11254 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
11257 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011258 if (left == unicode_empty)
11259 return PyUnicode_FromObject(right);
11260 if (right == unicode_empty)
11261 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011263 left_len = PyUnicode_GET_LENGTH(left);
11264 right_len = PyUnicode_GET_LENGTH(right);
11265 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011266 PyErr_SetString(PyExc_OverflowError,
11267 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011268 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011269 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011270 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011271
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011272 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11273 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011274 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011277 result = PyUnicode_New(new_len, maxchar);
11278 if (result == NULL)
11279 return NULL;
11280 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11281 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11282 assert(_PyUnicode_CheckConsistency(result, 1));
11283 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284}
11285
Walter Dörwald1ab83302007-05-18 17:15:44 +000011286void
Victor Stinner23e56682011-10-03 03:54:37 +020011287PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011288{
Victor Stinner23e56682011-10-03 03:54:37 +020011289 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011290 Py_UCS4 maxchar, maxchar2;
11291 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011292
11293 if (p_left == NULL) {
11294 if (!PyErr_Occurred())
11295 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011296 return;
11297 }
Victor Stinner23e56682011-10-03 03:54:37 +020011298 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011299 if (right == NULL || left == NULL
11300 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011301 if (!PyErr_Occurred())
11302 PyErr_BadInternalCall();
11303 goto error;
11304 }
11305
Benjamin Petersonbac79492012-01-14 13:34:47 -050011306 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011307 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011308 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011309 goto error;
11310
Victor Stinner488fa492011-12-12 00:01:39 +010011311 /* Shortcuts */
11312 if (left == unicode_empty) {
11313 Py_DECREF(left);
11314 Py_INCREF(right);
11315 *p_left = right;
11316 return;
11317 }
11318 if (right == unicode_empty)
11319 return;
11320
11321 left_len = PyUnicode_GET_LENGTH(left);
11322 right_len = PyUnicode_GET_LENGTH(right);
11323 if (left_len > PY_SSIZE_T_MAX - right_len) {
11324 PyErr_SetString(PyExc_OverflowError,
11325 "strings are too large to concat");
11326 goto error;
11327 }
11328 new_len = left_len + right_len;
11329
11330 if (unicode_modifiable(left)
11331 && PyUnicode_CheckExact(right)
11332 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011333 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11334 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011335 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011336 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011337 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11338 {
11339 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011340 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011341 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011342
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011343 /* copy 'right' into the newly allocated area of 'left' */
11344 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011345 }
Victor Stinner488fa492011-12-12 00:01:39 +010011346 else {
11347 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11348 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011349 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011350
Victor Stinner488fa492011-12-12 00:01:39 +010011351 /* Concat the two Unicode strings */
11352 res = PyUnicode_New(new_len, maxchar);
11353 if (res == NULL)
11354 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011355 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11356 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011357 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011358 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011359 }
11360 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011361 return;
11362
11363error:
Victor Stinner488fa492011-12-12 00:01:39 +010011364 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011365}
11366
11367void
11368PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11369{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011370 PyUnicode_Append(pleft, right);
11371 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011372}
11373
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011374/*
11375Wraps stringlib_parse_args_finds() and additionally ensures that the
11376first argument is a unicode object.
11377*/
11378
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011379static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011380parse_args_finds_unicode(const char * function_name, PyObject *args,
11381 PyObject **substring,
11382 Py_ssize_t *start, Py_ssize_t *end)
11383{
11384 if(stringlib_parse_args_finds(function_name, args, substring,
11385 start, end)) {
11386 if (ensure_unicode(*substring) < 0)
11387 return 0;
11388 return 1;
11389 }
11390 return 0;
11391}
11392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011393PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011396Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011397string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011398interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399
11400static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011401unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011403 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011404 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011405 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011407 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 void *buf1, *buf2;
11409 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011411 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 kind1 = PyUnicode_KIND(self);
11415 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011416 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011417 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 len1 = PyUnicode_GET_LENGTH(self);
11420 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011422 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011423 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011424
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011425 buf1 = PyUnicode_DATA(self);
11426 buf2 = PyUnicode_DATA(substring);
11427 if (kind2 != kind1) {
11428 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011429 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011430 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011431 }
11432 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 case PyUnicode_1BYTE_KIND:
11434 iresult = ucs1lib_count(
11435 ((Py_UCS1*)buf1) + start, end - start,
11436 buf2, len2, PY_SSIZE_T_MAX
11437 );
11438 break;
11439 case PyUnicode_2BYTE_KIND:
11440 iresult = ucs2lib_count(
11441 ((Py_UCS2*)buf1) + start, end - start,
11442 buf2, len2, PY_SSIZE_T_MAX
11443 );
11444 break;
11445 case PyUnicode_4BYTE_KIND:
11446 iresult = ucs4lib_count(
11447 ((Py_UCS4*)buf1) + start, end - start,
11448 buf2, len2, PY_SSIZE_T_MAX
11449 );
11450 break;
11451 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011452 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 }
11454
11455 result = PyLong_FromSsize_t(iresult);
11456
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011457 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 return result;
11461}
11462
INADA Naoki3ae20562017-01-16 20:41:20 +090011463/*[clinic input]
11464str.encode as unicode_encode
11465
11466 encoding: str(c_default="NULL") = 'utf-8'
11467 The encoding in which to encode the string.
11468 errors: str(c_default="NULL") = 'strict'
11469 The error handling scheme to use for encoding errors.
11470 The default is 'strict' meaning that encoding errors raise a
11471 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11472 'xmlcharrefreplace' as well as any other name registered with
11473 codecs.register_error that can handle UnicodeEncodeErrors.
11474
11475Encode the string using the codec registered for encoding.
11476[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
11478static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011479unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011480/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011482 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011483}
11484
INADA Naoki3ae20562017-01-16 20:41:20 +090011485/*[clinic input]
11486str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
INADA Naoki3ae20562017-01-16 20:41:20 +090011488 tabsize: int = 8
11489
11490Return a copy where all tab characters are expanded using spaces.
11491
11492If tabsize is not given, a tab size of 8 characters is assumed.
11493[clinic start generated code]*/
11494
11495static PyObject *
11496unicode_expandtabs_impl(PyObject *self, int tabsize)
11497/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011499 Py_ssize_t i, j, line_pos, src_len, incr;
11500 Py_UCS4 ch;
11501 PyObject *u;
11502 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011503 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011504 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
Antoine Pitrou22425222011-10-04 19:10:51 +020011506 if (PyUnicode_READY(self) == -1)
11507 return NULL;
11508
Thomas Wouters7e474022000-07-16 12:04:32 +000011509 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 src_len = PyUnicode_GET_LENGTH(self);
11511 i = j = line_pos = 0;
11512 kind = PyUnicode_KIND(self);
11513 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011514 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011515 for (; i < src_len; i++) {
11516 ch = PyUnicode_READ(kind, src_data, i);
11517 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011518 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011520 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011522 goto overflow;
11523 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011525 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011529 goto overflow;
11530 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011532 if (ch == '\n' || ch == '\r')
11533 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011536 if (!found)
11537 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011538
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011540 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 if (!u)
11542 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011543 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
Antoine Pitroue71d5742011-10-04 15:55:09 +020011545 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Antoine Pitroue71d5742011-10-04 15:55:09 +020011547 for (; i < src_len; i++) {
11548 ch = PyUnicode_READ(kind, src_data, i);
11549 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 incr = tabsize - (line_pos % tabsize);
11552 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011553 FILL(kind, dest_data, ' ', j, incr);
11554 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011556 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011558 line_pos++;
11559 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011560 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 if (ch == '\n' || ch == '\r')
11562 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011564 }
11565 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011566 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011567
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011569 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11570 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571}
11572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011573PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575\n\
11576Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011577such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578arguments start and end are interpreted as in slice notation.\n\
11579\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
11582static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011585 /* initialize variables to prevent gcc warning */
11586 PyObject *substring = NULL;
11587 Py_ssize_t start = 0;
11588 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011589 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011591 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011594 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011597 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 if (result == -2)
11600 return NULL;
11601
Christian Heimes217cfd12007-12-02 14:31:20 +000011602 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
11605static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011606unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011608 void *data;
11609 enum PyUnicode_Kind kind;
11610 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011611
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011612 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011613 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011615 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011616 if (PyUnicode_READY(self) == -1) {
11617 return NULL;
11618 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011619 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11620 PyErr_SetString(PyExc_IndexError, "string index out of range");
11621 return NULL;
11622 }
11623 kind = PyUnicode_KIND(self);
11624 data = PyUnicode_DATA(self);
11625 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011626 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
Guido van Rossumc2504932007-09-18 19:42:40 +000011629/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011630 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011631static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011632unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633{
Guido van Rossumc2504932007-09-18 19:42:40 +000011634 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011635 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011636
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011637#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011638 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011639#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 if (_PyUnicode_HASH(self) != -1)
11641 return _PyUnicode_HASH(self);
11642 if (PyUnicode_READY(self) == -1)
11643 return -1;
11644 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011645 /*
11646 We make the hash of the empty string be 0, rather than using
11647 (prefix ^ suffix), since this slightly obfuscates the hash secret
11648 */
11649 if (len == 0) {
11650 _PyUnicode_HASH(self) = 0;
11651 return 0;
11652 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011653 x = _Py_HashBytes(PyUnicode_DATA(self),
11654 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011656 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657}
11658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011659PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011662Return the lowest index in S where substring sub is found, \n\
11663such that sub is contained within S[start:end]. Optional\n\
11664arguments start and end are interpreted as in slice notation.\n\
11665\n\
11666Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667
11668static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011671 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011672 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011673 PyObject *substring = NULL;
11674 Py_ssize_t start = 0;
11675 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011677 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011680 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011683 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (result == -2)
11686 return NULL;
11687
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 if (result < 0) {
11689 PyErr_SetString(PyExc_ValueError, "substring not found");
11690 return NULL;
11691 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011692
Christian Heimes217cfd12007-12-02 14:31:20 +000011693 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694}
11695
INADA Naoki3ae20562017-01-16 20:41:20 +090011696/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011697str.isascii as unicode_isascii
11698
11699Return True if all characters in the string are ASCII, False otherwise.
11700
11701ASCII characters have code points in the range U+0000-U+007F.
11702Empty string is ASCII too.
11703[clinic start generated code]*/
11704
11705static PyObject *
11706unicode_isascii_impl(PyObject *self)
11707/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11708{
11709 if (PyUnicode_READY(self) == -1) {
11710 return NULL;
11711 }
11712 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11713}
11714
11715/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011716str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
INADA Naoki3ae20562017-01-16 20:41:20 +090011718Return True if the string is a lowercase string, False otherwise.
11719
11720A string is lowercase if all cased characters in the string are lowercase and
11721there is at least one cased character in the string.
11722[clinic start generated code]*/
11723
11724static PyObject *
11725unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011726/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 Py_ssize_t i, length;
11729 int kind;
11730 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 int cased;
11732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 if (PyUnicode_READY(self) == -1)
11734 return NULL;
11735 length = PyUnicode_GET_LENGTH(self);
11736 kind = PyUnicode_KIND(self);
11737 data = PyUnicode_DATA(self);
11738
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 if (length == 1)
11741 return PyBool_FromLong(
11742 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011744 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011746 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011747
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 for (i = 0; i < length; i++) {
11750 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011751
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011753 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 else if (!cased && Py_UNICODE_ISLOWER(ch))
11755 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011757 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758}
11759
INADA Naoki3ae20562017-01-16 20:41:20 +090011760/*[clinic input]
11761str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
INADA Naoki3ae20562017-01-16 20:41:20 +090011763Return True if the string is an uppercase string, False otherwise.
11764
11765A string is uppercase if all cased characters in the string are uppercase and
11766there is at least one cased character in the string.
11767[clinic start generated code]*/
11768
11769static PyObject *
11770unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011771/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 Py_ssize_t i, length;
11774 int kind;
11775 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 int cased;
11777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 if (PyUnicode_READY(self) == -1)
11779 return NULL;
11780 length = PyUnicode_GET_LENGTH(self);
11781 kind = PyUnicode_KIND(self);
11782 data = PyUnicode_DATA(self);
11783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (length == 1)
11786 return PyBool_FromLong(
11787 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011789 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011791 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011792
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 for (i = 0; i < length; i++) {
11795 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011796
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011798 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 else if (!cased && Py_UNICODE_ISUPPER(ch))
11800 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011802 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803}
11804
INADA Naoki3ae20562017-01-16 20:41:20 +090011805/*[clinic input]
11806str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807
INADA Naoki3ae20562017-01-16 20:41:20 +090011808Return True if the string is a title-cased string, False otherwise.
11809
11810In a title-cased string, upper- and title-case characters may only
11811follow uncased characters and lowercase characters only cased ones.
11812[clinic start generated code]*/
11813
11814static PyObject *
11815unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011816/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 Py_ssize_t i, length;
11819 int kind;
11820 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 int cased, previous_is_cased;
11822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (PyUnicode_READY(self) == -1)
11824 return NULL;
11825 length = PyUnicode_GET_LENGTH(self);
11826 kind = PyUnicode_KIND(self);
11827 data = PyUnicode_DATA(self);
11828
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 if (length == 1) {
11831 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11832 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11833 (Py_UNICODE_ISUPPER(ch) != 0));
11834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011836 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011838 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011839
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 cased = 0;
11841 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 for (i = 0; i < length; i++) {
11843 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011844
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11846 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011847 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 previous_is_cased = 1;
11849 cased = 1;
11850 }
11851 else if (Py_UNICODE_ISLOWER(ch)) {
11852 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011853 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 previous_is_cased = 1;
11855 cased = 1;
11856 }
11857 else
11858 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011860 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861}
11862
INADA Naoki3ae20562017-01-16 20:41:20 +090011863/*[clinic input]
11864str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865
INADA Naoki3ae20562017-01-16 20:41:20 +090011866Return True if the string is a whitespace string, False otherwise.
11867
11868A string is whitespace if all characters in the string are whitespace and there
11869is at least one character in the string.
11870[clinic start generated code]*/
11871
11872static PyObject *
11873unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011874/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 Py_ssize_t i, length;
11877 int kind;
11878 void *data;
11879
11880 if (PyUnicode_READY(self) == -1)
11881 return NULL;
11882 length = PyUnicode_GET_LENGTH(self);
11883 kind = PyUnicode_KIND(self);
11884 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (length == 1)
11888 return PyBool_FromLong(
11889 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011891 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011893 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 for (i = 0; i < length; i++) {
11896 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011897 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011898 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011900 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901}
11902
INADA Naoki3ae20562017-01-16 20:41:20 +090011903/*[clinic input]
11904str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011905
INADA Naoki3ae20562017-01-16 20:41:20 +090011906Return True if the string is an alphabetic string, False otherwise.
11907
11908A string is alphabetic if all characters in the string are alphabetic and there
11909is at least one character in the string.
11910[clinic start generated code]*/
11911
11912static PyObject *
11913unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011914/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 Py_ssize_t i, length;
11917 int kind;
11918 void *data;
11919
11920 if (PyUnicode_READY(self) == -1)
11921 return NULL;
11922 length = PyUnicode_GET_LENGTH(self);
11923 kind = PyUnicode_KIND(self);
11924 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011925
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011926 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 if (length == 1)
11928 return PyBool_FromLong(
11929 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011930
11931 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011933 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 for (i = 0; i < length; i++) {
11936 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011937 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011938 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011939 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011940}
11941
INADA Naoki3ae20562017-01-16 20:41:20 +090011942/*[clinic input]
11943str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011944
INADA Naoki3ae20562017-01-16 20:41:20 +090011945Return True if the string is an alpha-numeric string, False otherwise.
11946
11947A string is alpha-numeric if all characters in the string are alpha-numeric and
11948there is at least one character in the string.
11949[clinic start generated code]*/
11950
11951static PyObject *
11952unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011953/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011954{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 int kind;
11956 void *data;
11957 Py_ssize_t len, i;
11958
11959 if (PyUnicode_READY(self) == -1)
11960 return NULL;
11961
11962 kind = PyUnicode_KIND(self);
11963 data = PyUnicode_DATA(self);
11964 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011965
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011966 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 if (len == 1) {
11968 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11969 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11970 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011971
11972 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011974 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 for (i = 0; i < len; i++) {
11977 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011978 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011979 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011980 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011981 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011982}
11983
INADA Naoki3ae20562017-01-16 20:41:20 +090011984/*[clinic input]
11985str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
INADA Naoki3ae20562017-01-16 20:41:20 +090011987Return True if the string is a decimal string, False otherwise.
11988
11989A string is a decimal string if all characters in the string are decimal and
11990there is at least one character in the string.
11991[clinic start generated code]*/
11992
11993static PyObject *
11994unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011995/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 Py_ssize_t i, length;
11998 int kind;
11999 void *data;
12000
12001 if (PyUnicode_READY(self) == -1)
12002 return NULL;
12003 length = PyUnicode_GET_LENGTH(self);
12004 kind = PyUnicode_KIND(self);
12005 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (length == 1)
12009 return PyBool_FromLong(
12010 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012012 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012014 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 for (i = 0; i < length; i++) {
12017 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012018 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012020 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021}
12022
INADA Naoki3ae20562017-01-16 20:41:20 +090012023/*[clinic input]
12024str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025
INADA Naoki3ae20562017-01-16 20:41:20 +090012026Return True if the string is a digit string, False otherwise.
12027
12028A string is a digit string if all characters in the string are digits and there
12029is at least one character in the string.
12030[clinic start generated code]*/
12031
12032static PyObject *
12033unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012034/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 Py_ssize_t i, length;
12037 int kind;
12038 void *data;
12039
12040 if (PyUnicode_READY(self) == -1)
12041 return NULL;
12042 length = PyUnicode_GET_LENGTH(self);
12043 kind = PyUnicode_KIND(self);
12044 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 if (length == 1) {
12048 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12049 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012052 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012054 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 for (i = 0; i < length; i++) {
12057 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012058 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012060 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061}
12062
INADA Naoki3ae20562017-01-16 20:41:20 +090012063/*[clinic input]
12064str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
INADA Naoki3ae20562017-01-16 20:41:20 +090012066Return True if the string is a numeric string, False otherwise.
12067
12068A string is numeric if all characters in the string are numeric and there is at
12069least one character in the string.
12070[clinic start generated code]*/
12071
12072static PyObject *
12073unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012074/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 Py_ssize_t i, length;
12077 int kind;
12078 void *data;
12079
12080 if (PyUnicode_READY(self) == -1)
12081 return NULL;
12082 length = PyUnicode_GET_LENGTH(self);
12083 kind = PyUnicode_KIND(self);
12084 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 if (length == 1)
12088 return PyBool_FromLong(
12089 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012091 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012093 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 for (i = 0; i < length; i++) {
12096 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012097 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012099 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100}
12101
Martin v. Löwis47383402007-08-15 07:32:56 +000012102int
12103PyUnicode_IsIdentifier(PyObject *self)
12104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 int kind;
12106 void *data;
12107 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012108 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (PyUnicode_READY(self) == -1) {
12111 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 }
12114
12115 /* Special case for empty strings */
12116 if (PyUnicode_GET_LENGTH(self) == 0)
12117 return 0;
12118 kind = PyUnicode_KIND(self);
12119 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012120
12121 /* PEP 3131 says that the first character must be in
12122 XID_Start and subsequent characters in XID_Continue,
12123 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012124 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012125 letters, digits, underscore). However, given the current
12126 definition of XID_Start and XID_Continue, it is sufficient
12127 to check just for these, except that _ must be allowed
12128 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012130 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012131 return 0;
12132
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012133 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012136 return 1;
12137}
12138
INADA Naoki3ae20562017-01-16 20:41:20 +090012139/*[clinic input]
12140str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012141
INADA Naoki3ae20562017-01-16 20:41:20 +090012142Return True if the string is a valid Python identifier, False otherwise.
12143
12144Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12145"class".
12146[clinic start generated code]*/
12147
12148static PyObject *
12149unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012150/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012151{
12152 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12153}
12154
INADA Naoki3ae20562017-01-16 20:41:20 +090012155/*[clinic input]
12156str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012157
INADA Naoki3ae20562017-01-16 20:41:20 +090012158Return True if the string is printable, False otherwise.
12159
12160A string is printable if all of its characters are considered printable in
12161repr() or if it is empty.
12162[clinic start generated code]*/
12163
12164static PyObject *
12165unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012166/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 Py_ssize_t i, length;
12169 int kind;
12170 void *data;
12171
12172 if (PyUnicode_READY(self) == -1)
12173 return NULL;
12174 length = PyUnicode_GET_LENGTH(self);
12175 kind = PyUnicode_KIND(self);
12176 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012177
12178 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (length == 1)
12180 return PyBool_FromLong(
12181 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 for (i = 0; i < length; i++) {
12184 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012185 Py_RETURN_FALSE;
12186 }
12187 }
12188 Py_RETURN_TRUE;
12189}
12190
INADA Naoki3ae20562017-01-16 20:41:20 +090012191/*[clinic input]
12192str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193
INADA Naoki3ae20562017-01-16 20:41:20 +090012194 iterable: object
12195 /
12196
12197Concatenate any number of strings.
12198
Martin Panter91a88662017-01-24 00:30:06 +000012199The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012200The result is returned as a new string.
12201
12202Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12203[clinic start generated code]*/
12204
12205static PyObject *
12206unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012207/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208{
INADA Naoki3ae20562017-01-16 20:41:20 +090012209 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210}
12211
Martin v. Löwis18e16552006-02-15 17:27:45 +000012212static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012213unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (PyUnicode_READY(self) == -1)
12216 return -1;
12217 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218}
12219
INADA Naoki3ae20562017-01-16 20:41:20 +090012220/*[clinic input]
12221str.ljust as unicode_ljust
12222
12223 width: Py_ssize_t
12224 fillchar: Py_UCS4 = ' '
12225 /
12226
12227Return a left-justified string of length width.
12228
12229Padding is done using the specified fill character (default is a space).
12230[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231
12232static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012233unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12234/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012236 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238
Victor Stinnerc4b49542011-12-11 22:44:26 +010012239 if (PyUnicode_GET_LENGTH(self) >= width)
12240 return unicode_result_unchanged(self);
12241
12242 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243}
12244
INADA Naoki3ae20562017-01-16 20:41:20 +090012245/*[clinic input]
12246str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247
INADA Naoki3ae20562017-01-16 20:41:20 +090012248Return a copy of the string converted to lowercase.
12249[clinic start generated code]*/
12250
12251static PyObject *
12252unicode_lower_impl(PyObject *self)
12253/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012255 if (PyUnicode_READY(self) == -1)
12256 return NULL;
12257 if (PyUnicode_IS_ASCII(self))
12258 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012259 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260}
12261
/* Strip modes.  The values double as indices into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and
   the array slots are const: the table is a read-only lookup and must
   never be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012270
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012271/* externally visible for str.strip(unicode) */
12272PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012273_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 void *data;
12276 int kind;
12277 Py_ssize_t i, j, len;
12278 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012279 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12282 return NULL;
12283
12284 kind = PyUnicode_KIND(self);
12285 data = PyUnicode_DATA(self);
12286 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012287 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12289 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012290 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291
Benjamin Peterson14339b62009-01-31 16:36:08 +000012292 i = 0;
12293 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012294 while (i < len) {
12295 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12296 if (!BLOOM(sepmask, ch))
12297 break;
12298 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12299 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 i++;
12301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012302 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012303
Benjamin Peterson14339b62009-01-31 16:36:08 +000012304 j = len;
12305 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012306 j--;
12307 while (j >= i) {
12308 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12309 if (!BLOOM(sepmask, ch))
12310 break;
12311 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12312 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012314 }
12315
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012317 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318
Victor Stinner7931d9a2011-11-04 00:22:48 +010012319 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320}
12321
12322PyObject*
12323PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12324{
12325 unsigned char *data;
12326 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012327 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328
Victor Stinnerde636f32011-10-01 03:55:54 +020012329 if (PyUnicode_READY(self) == -1)
12330 return NULL;
12331
Victor Stinner684d5fd2012-05-03 02:32:34 +020012332 length = PyUnicode_GET_LENGTH(self);
12333 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012334
Victor Stinner684d5fd2012-05-03 02:32:34 +020012335 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012336 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337
Victor Stinnerde636f32011-10-01 03:55:54 +020012338 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012339 PyErr_SetString(PyExc_IndexError, "string index out of range");
12340 return NULL;
12341 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012342 if (start >= length || end < start)
12343 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012344
Victor Stinner684d5fd2012-05-03 02:32:34 +020012345 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012346 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012347 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012348 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012349 }
12350 else {
12351 kind = PyUnicode_KIND(self);
12352 data = PyUnicode_1BYTE_DATA(self);
12353 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012354 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012355 length);
12356 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
12359static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012360do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 Py_ssize_t len, i, j;
12363
12364 if (PyUnicode_READY(self) == -1)
12365 return NULL;
12366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012368
Victor Stinnercc7af722013-04-09 22:39:24 +020012369 if (PyUnicode_IS_ASCII(self)) {
12370 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12371
12372 i = 0;
12373 if (striptype != RIGHTSTRIP) {
12374 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012375 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012376 if (!_Py_ascii_whitespace[ch])
12377 break;
12378 i++;
12379 }
12380 }
12381
12382 j = len;
12383 if (striptype != LEFTSTRIP) {
12384 j--;
12385 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012386 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012387 if (!_Py_ascii_whitespace[ch])
12388 break;
12389 j--;
12390 }
12391 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 }
12393 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012394 else {
12395 int kind = PyUnicode_KIND(self);
12396 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012397
Victor Stinnercc7af722013-04-09 22:39:24 +020012398 i = 0;
12399 if (striptype != RIGHTSTRIP) {
12400 while (i < len) {
12401 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12402 if (!Py_UNICODE_ISSPACE(ch))
12403 break;
12404 i++;
12405 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012406 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012407
12408 j = len;
12409 if (striptype != LEFTSTRIP) {
12410 j--;
12411 while (j >= i) {
12412 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12413 if (!Py_UNICODE_ISSPACE(ch))
12414 break;
12415 j--;
12416 }
12417 j++;
12418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012420
Victor Stinner7931d9a2011-11-04 00:22:48 +010012421 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422}
12423
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012424
12425static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012426do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012427{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012428 if (sep != NULL && sep != Py_None) {
12429 if (PyUnicode_Check(sep))
12430 return _PyUnicode_XStrip(self, striptype, sep);
12431 else {
12432 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 "%s arg must be None or str",
12434 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 return NULL;
12436 }
12437 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012438
Benjamin Peterson14339b62009-01-31 16:36:08 +000012439 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440}
12441
12442
INADA Naoki3ae20562017-01-16 20:41:20 +090012443/*[clinic input]
12444str.strip as unicode_strip
12445
12446 chars: object = None
12447 /
12448
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012450
12451If chars is given and not None, remove characters in chars instead.
12452[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012453
12454static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012455unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012456/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457{
INADA Naoki3ae20562017-01-16 20:41:20 +090012458 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459}
12460
12461
INADA Naoki3ae20562017-01-16 20:41:20 +090012462/*[clinic input]
12463str.lstrip as unicode_lstrip
12464
12465 chars: object = NULL
12466 /
12467
12468Return a copy of the string with leading whitespace removed.
12469
12470If chars is given and not None, remove characters in chars instead.
12471[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012472
12473static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012474unicode_lstrip_impl(PyObject *self, PyObject *chars)
12475/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012476{
INADA Naoki3ae20562017-01-16 20:41:20 +090012477 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012478}
12479
12480
INADA Naoki3ae20562017-01-16 20:41:20 +090012481/*[clinic input]
12482str.rstrip as unicode_rstrip
12483
12484 chars: object = NULL
12485 /
12486
12487Return a copy of the string with trailing whitespace removed.
12488
12489If chars is given and not None, remove characters in chars instead.
12490[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012491
12492static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012493unicode_rstrip_impl(PyObject *self, PyObject *chars)
12494/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012495{
INADA Naoki3ae20562017-01-16 20:41:20 +090012496 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012497}
12498
12499
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012501unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012503 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505
Serhiy Storchaka05997252013-01-26 12:14:02 +020012506 if (len < 1)
12507 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508
Victor Stinnerc4b49542011-12-11 22:44:26 +010012509 /* no repeat, return original string */
12510 if (len == 1)
12511 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012512
Benjamin Petersonbac79492012-01-14 13:34:47 -050012513 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 return NULL;
12515
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012516 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012517 PyErr_SetString(PyExc_OverflowError,
12518 "repeated string is too long");
12519 return NULL;
12520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012522
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012523 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524 if (!u)
12525 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012526 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 if (PyUnicode_GET_LENGTH(str) == 1) {
12529 const int kind = PyUnicode_KIND(str);
12530 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012531 if (kind == PyUnicode_1BYTE_KIND) {
12532 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012533 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012534 }
12535 else if (kind == PyUnicode_2BYTE_KIND) {
12536 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012537 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012538 ucs2[n] = fill_char;
12539 } else {
12540 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12541 assert(kind == PyUnicode_4BYTE_KIND);
12542 for (n = 0; n < len; ++n)
12543 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 }
12546 else {
12547 /* number of characters copied this far */
12548 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012549 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012551 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012555 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012556 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558 }
12559
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012560 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012561 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562}
12563
Alexander Belopolsky40018472011-02-26 01:02:56 +000012564PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012565PyUnicode_Replace(PyObject *str,
12566 PyObject *substr,
12567 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012568 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012570 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12571 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012573 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574}
12575
INADA Naoki3ae20562017-01-16 20:41:20 +090012576/*[clinic input]
12577str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578
INADA Naoki3ae20562017-01-16 20:41:20 +090012579 old: unicode
12580 new: unicode
12581 count: Py_ssize_t = -1
12582 Maximum number of occurrences to replace.
12583 -1 (the default value) means replace all occurrences.
12584 /
12585
12586Return a copy with all occurrences of substring old replaced by new.
12587
12588If the optional argument count is given, only the first count occurrences are
12589replaced.
12590[clinic start generated code]*/
12591
12592static PyObject *
12593unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12594 Py_ssize_t count)
12595/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012597 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012598 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012599 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600}
12601
Alexander Belopolsky40018472011-02-26 01:02:56 +000012602static PyObject *
12603unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012605 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 Py_ssize_t isize;
12607 Py_ssize_t osize, squote, dquote, i, o;
12608 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012609 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012613 return NULL;
12614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 isize = PyUnicode_GET_LENGTH(unicode);
12616 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 /* Compute length of output, quote characters, and
12619 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012620 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 max = 127;
12622 squote = dquote = 0;
12623 ikind = PyUnicode_KIND(unicode);
12624 for (i = 0; i < isize; i++) {
12625 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012626 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012628 case '\'': squote++; break;
12629 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012631 incr = 2;
12632 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 default:
12634 /* Fast-path ASCII */
12635 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012636 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012638 ;
12639 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012642 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012644 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012646 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012648 if (osize > PY_SSIZE_T_MAX - incr) {
12649 PyErr_SetString(PyExc_OverflowError,
12650 "string is too long to generate repr");
12651 return NULL;
12652 }
12653 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 }
12655
12656 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012657 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012659 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 if (dquote)
12661 /* Both squote and dquote present. Use squote,
12662 and escape them */
12663 osize += squote;
12664 else
12665 quote = '"';
12666 }
Victor Stinner55c08782013-04-14 18:45:39 +020012667 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668
12669 repr = PyUnicode_New(osize, max);
12670 if (repr == NULL)
12671 return NULL;
12672 okind = PyUnicode_KIND(repr);
12673 odata = PyUnicode_DATA(repr);
12674
12675 PyUnicode_WRITE(okind, odata, 0, quote);
12676 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012677 if (unchanged) {
12678 _PyUnicode_FastCopyCharacters(repr, 1,
12679 unicode, 0,
12680 isize);
12681 }
12682 else {
12683 for (i = 0, o = 1; i < isize; i++) {
12684 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685
Victor Stinner55c08782013-04-14 18:45:39 +020012686 /* Escape quotes and backslashes */
12687 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012688 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012690 continue;
12691 }
12692
12693 /* Map special whitespace to '\t', \n', '\r' */
12694 if (ch == '\t') {
12695 PyUnicode_WRITE(okind, odata, o++, '\\');
12696 PyUnicode_WRITE(okind, odata, o++, 't');
12697 }
12698 else if (ch == '\n') {
12699 PyUnicode_WRITE(okind, odata, o++, '\\');
12700 PyUnicode_WRITE(okind, odata, o++, 'n');
12701 }
12702 else if (ch == '\r') {
12703 PyUnicode_WRITE(okind, odata, o++, '\\');
12704 PyUnicode_WRITE(okind, odata, o++, 'r');
12705 }
12706
12707 /* Map non-printable US ASCII to '\xhh' */
12708 else if (ch < ' ' || ch == 0x7F) {
12709 PyUnicode_WRITE(okind, odata, o++, '\\');
12710 PyUnicode_WRITE(okind, odata, o++, 'x');
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12712 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12713 }
12714
12715 /* Copy ASCII characters as-is */
12716 else if (ch < 0x7F) {
12717 PyUnicode_WRITE(okind, odata, o++, ch);
12718 }
12719
12720 /* Non-ASCII characters */
12721 else {
12722 /* Map Unicode whitespace and control characters
12723 (categories Z* and C* except ASCII space)
12724 */
12725 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12726 PyUnicode_WRITE(okind, odata, o++, '\\');
12727 /* Map 8-bit characters to '\xhh' */
12728 if (ch <= 0xff) {
12729 PyUnicode_WRITE(okind, odata, o++, 'x');
12730 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12731 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12732 }
12733 /* Map 16-bit characters to '\uxxxx' */
12734 else if (ch <= 0xffff) {
12735 PyUnicode_WRITE(okind, odata, o++, 'u');
12736 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12737 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12738 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12739 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12740 }
12741 /* Map 21-bit characters to '\U00xxxxxx' */
12742 else {
12743 PyUnicode_WRITE(okind, odata, o++, 'U');
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12748 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12749 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12752 }
12753 }
12754 /* Copy characters as-is */
12755 else {
12756 PyUnicode_WRITE(okind, odata, o++, ch);
12757 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012758 }
12759 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012762 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012763 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764}
12765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012766PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768\n\
12769Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012770such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771arguments start and end are interpreted as in slice notation.\n\
12772\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012773Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774
12775static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012778 /* initialize variables to prevent gcc warning */
12779 PyObject *substring = NULL;
12780 Py_ssize_t start = 0;
12781 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012784 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012787 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012790 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 if (result == -2)
12793 return NULL;
12794
Christian Heimes217cfd12007-12-02 14:31:20 +000012795 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796}
12797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012798PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012801Return the highest index in S where substring sub is found,\n\
12802such that sub is contained within S[start:end]. Optional\n\
12803arguments start and end are interpreted as in slice notation.\n\
12804\n\
12805Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
12807static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012810 /* initialize variables to prevent gcc warning */
12811 PyObject *substring = NULL;
12812 Py_ssize_t start = 0;
12813 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012816 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012819 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012822 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 if (result == -2)
12825 return NULL;
12826
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827 if (result < 0) {
12828 PyErr_SetString(PyExc_ValueError, "substring not found");
12829 return NULL;
12830 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831
Christian Heimes217cfd12007-12-02 14:31:20 +000012832 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833}
12834
INADA Naoki3ae20562017-01-16 20:41:20 +090012835/*[clinic input]
12836str.rjust as unicode_rjust
12837
12838 width: Py_ssize_t
12839 fillchar: Py_UCS4 = ' '
12840 /
12841
12842Return a right-justified string of length width.
12843
12844Padding is done using the specified fill character (default is a space).
12845[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846
12847static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012848unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12849/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012851 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852 return NULL;
12853
Victor Stinnerc4b49542011-12-11 22:44:26 +010012854 if (PyUnicode_GET_LENGTH(self) >= width)
12855 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856
Victor Stinnerc4b49542011-12-11 22:44:26 +010012857 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858}
12859
Alexander Belopolsky40018472011-02-26 01:02:56 +000012860PyObject *
12861PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012863 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012866 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867}
12868
INADA Naoki3ae20562017-01-16 20:41:20 +090012869/*[clinic input]
12870str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871
INADA Naoki3ae20562017-01-16 20:41:20 +090012872 sep: object = None
        The delimiter according to which to split the string.
12874 None (the default value) means split according to any whitespace,
12875 and discard empty strings from the result.
12876 maxsplit: Py_ssize_t = -1
12877 Maximum number of splits to do.
12878 -1 (the default value) means no limit.
12879
12880Return a list of the words in the string, using sep as the delimiter string.
12881[clinic start generated code]*/
12882
12883static PyObject *
12884unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12885/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
INADA Naoki3ae20562017-01-16 20:41:20 +090012887 if (sep == Py_None)
12888 return split(self, NULL, maxsplit);
12889 if (PyUnicode_Check(sep))
12890 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012891
12892 PyErr_Format(PyExc_TypeError,
12893 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012894 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896}
12897
Thomas Wouters477c8d52006-05-27 19:21:47 +000012898PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012899PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012902 int kind1, kind2;
12903 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012905
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012906 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908
Victor Stinner14f8f022011-10-05 20:58:25 +020012909 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 len1 = PyUnicode_GET_LENGTH(str_obj);
12912 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012913 if (kind1 < kind2 || len1 < len2) {
12914 _Py_INCREF_UNICODE_EMPTY();
12915 if (!unicode_empty)
12916 out = NULL;
12917 else {
12918 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12919 Py_DECREF(unicode_empty);
12920 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012921 return out;
12922 }
12923 buf1 = PyUnicode_DATA(str_obj);
12924 buf2 = PyUnicode_DATA(sep_obj);
12925 if (kind2 != kind1) {
12926 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12927 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012928 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012931 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012933 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12934 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12935 else
12936 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 break;
12938 case PyUnicode_2BYTE_KIND:
12939 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12940 break;
12941 case PyUnicode_4BYTE_KIND:
12942 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12943 break;
12944 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012945 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012947
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012948 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012950
12951 return out;
12952}
12953
12954
12955PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012956PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012957{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012958 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012959 int kind1, kind2;
12960 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012962
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012963 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012964 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012965
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012966 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 len1 = PyUnicode_GET_LENGTH(str_obj);
12969 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012970 if (kind1 < kind2 || len1 < len2) {
12971 _Py_INCREF_UNICODE_EMPTY();
12972 if (!unicode_empty)
12973 out = NULL;
12974 else {
12975 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12976 Py_DECREF(unicode_empty);
12977 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012978 return out;
12979 }
12980 buf1 = PyUnicode_DATA(str_obj);
12981 buf2 = PyUnicode_DATA(sep_obj);
12982 if (kind2 != kind1) {
12983 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12984 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012985 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012988 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012990 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12991 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12992 else
12993 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 break;
12995 case PyUnicode_2BYTE_KIND:
12996 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12997 break;
12998 case PyUnicode_4BYTE_KIND:
12999 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13000 break;
13001 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013002 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013004
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013005 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013007
13008 return out;
13009}
13010
INADA Naoki3ae20562017-01-16 20:41:20 +090013011/*[clinic input]
13012str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013013
INADA Naoki3ae20562017-01-16 20:41:20 +090013014 sep: object
13015 /
13016
13017Partition the string into three parts using the given separator.
13018
13019This will search for the separator in the string. If the separator is found,
13020returns a 3-tuple containing the part before the separator, the separator
13021itself, and the part after it.
13022
13023If the separator is not found, returns a 3-tuple containing the original string
13024and two empty strings.
13025[clinic start generated code]*/
13026
13027static PyObject *
13028unicode_partition(PyObject *self, PyObject *sep)
13029/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013030{
INADA Naoki3ae20562017-01-16 20:41:20 +090013031 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013032}
13033
INADA Naoki3ae20562017-01-16 20:41:20 +090013034/*[clinic input]
13035str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013036
INADA Naoki3ae20562017-01-16 20:41:20 +090013037Partition the string into three parts using the given separator.
13038
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013039This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013040the separator is found, returns a 3-tuple containing the part before the
13041separator, the separator itself, and the part after it.
13042
13043If the separator is not found, returns a 3-tuple containing two empty strings
13044and the original string.
13045[clinic start generated code]*/
13046
13047static PyObject *
13048unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013049/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013050{
INADA Naoki3ae20562017-01-16 20:41:20 +090013051 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052}
13053
Alexander Belopolsky40018472011-02-26 01:02:56 +000013054PyObject *
13055PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013056{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013057 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013058 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013059
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013060 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013061}
13062
INADA Naoki3ae20562017-01-16 20:41:20 +090013063/*[clinic input]
13064str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013065
INADA Naoki3ae20562017-01-16 20:41:20 +090013066Return a list of the words in the string, using sep as the delimiter string.
13067
13068Splits are done starting at the end of the string and working to the front.
13069[clinic start generated code]*/
13070
13071static PyObject *
13072unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13073/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013074{
INADA Naoki3ae20562017-01-16 20:41:20 +090013075 if (sep == Py_None)
13076 return rsplit(self, NULL, maxsplit);
13077 if (PyUnicode_Check(sep))
13078 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013079
13080 PyErr_Format(PyExc_TypeError,
13081 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013082 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013083 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013084}
13085
INADA Naoki3ae20562017-01-16 20:41:20 +090013086/*[clinic input]
13087str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013089 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013090
13091Return a list of the lines in the string, breaking at line boundaries.
13092
13093Line breaks are not included in the resulting list unless keepends is given and
13094true.
13095[clinic start generated code]*/
13096
13097static PyObject *
13098unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013099/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013101 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102}
13103
13104static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013105PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013107 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108}
13109
INADA Naoki3ae20562017-01-16 20:41:20 +090013110/*[clinic input]
13111str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112
INADA Naoki3ae20562017-01-16 20:41:20 +090013113Convert uppercase characters to lowercase and lowercase characters to uppercase.
13114[clinic start generated code]*/
13115
13116static PyObject *
13117unicode_swapcase_impl(PyObject *self)
13118/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013120 if (PyUnicode_READY(self) == -1)
13121 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013122 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123}
13124
Larry Hastings61272b72014-01-07 12:41:53 -080013125/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013126
Larry Hastings31826802013-10-19 00:09:25 -070013127@staticmethod
13128str.maketrans as unicode_maketrans
13129
13130 x: object
13131
13132 y: unicode=NULL
13133
13134 z: unicode=NULL
13135
13136 /
13137
13138Return a translation table usable for str.translate().
13139
13140If there is only one argument, it must be a dictionary mapping Unicode
13141ordinals (integers) or characters to Unicode ordinals, strings or None.
13142Character keys will be then converted to ordinals.
13143If there are two arguments, they must be strings of equal length, and
13144in the resulting dictionary, each character in x will be mapped to the
13145character at the same position in y. If there is a third argument, it
13146must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013147[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013148
Larry Hastings31826802013-10-19 00:09:25 -070013149static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013150unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013151/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013152{
Georg Brandlceee0772007-11-27 23:48:05 +000013153 PyObject *new = NULL, *key, *value;
13154 Py_ssize_t i = 0;
13155 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013156
Georg Brandlceee0772007-11-27 23:48:05 +000013157 new = PyDict_New();
13158 if (!new)
13159 return NULL;
13160 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 int x_kind, y_kind, z_kind;
13162 void *x_data, *y_data, *z_data;
13163
Georg Brandlceee0772007-11-27 23:48:05 +000013164 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013165 if (!PyUnicode_Check(x)) {
13166 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13167 "be a string if there is a second argument");
13168 goto err;
13169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013171 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13172 "arguments must have equal length");
13173 goto err;
13174 }
13175 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176 x_kind = PyUnicode_KIND(x);
13177 y_kind = PyUnicode_KIND(y);
13178 x_data = PyUnicode_DATA(x);
13179 y_data = PyUnicode_DATA(y);
13180 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13181 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013182 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013183 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013184 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013185 if (!value) {
13186 Py_DECREF(key);
13187 goto err;
13188 }
Georg Brandlceee0772007-11-27 23:48:05 +000013189 res = PyDict_SetItem(new, key, value);
13190 Py_DECREF(key);
13191 Py_DECREF(value);
13192 if (res < 0)
13193 goto err;
13194 }
13195 /* create entries for deleting chars in z */
13196 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 z_kind = PyUnicode_KIND(z);
13198 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013199 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013201 if (!key)
13202 goto err;
13203 res = PyDict_SetItem(new, key, Py_None);
13204 Py_DECREF(key);
13205 if (res < 0)
13206 goto err;
13207 }
13208 }
13209 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 int kind;
13211 void *data;
13212
Georg Brandlceee0772007-11-27 23:48:05 +000013213 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013214 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013215 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13216 "to maketrans it must be a dict");
13217 goto err;
13218 }
13219 /* copy entries into the new dict, converting string keys to int keys */
13220 while (PyDict_Next(x, &i, &key, &value)) {
13221 if (PyUnicode_Check(key)) {
13222 /* convert string keys to integer keys */
13223 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013224 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013225 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13226 "table must be of length 1");
13227 goto err;
13228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 kind = PyUnicode_KIND(key);
13230 data = PyUnicode_DATA(key);
13231 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013232 if (!newkey)
13233 goto err;
13234 res = PyDict_SetItem(new, newkey, value);
13235 Py_DECREF(newkey);
13236 if (res < 0)
13237 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013238 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013239 /* just keep integer keys */
13240 if (PyDict_SetItem(new, key, value) < 0)
13241 goto err;
13242 } else {
13243 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13244 "be strings or integers");
13245 goto err;
13246 }
13247 }
13248 }
13249 return new;
13250 err:
13251 Py_DECREF(new);
13252 return NULL;
13253}
13254
INADA Naoki3ae20562017-01-16 20:41:20 +090013255/*[clinic input]
13256str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257
INADA Naoki3ae20562017-01-16 20:41:20 +090013258 table: object
13259 Translation table, which must be a mapping of Unicode ordinals to
13260 Unicode ordinals, strings, or None.
13261 /
13262
13263Replace each character in the string using the given translation table.
13264
13265The table must implement lookup/indexing via __getitem__, for instance a
13266dictionary or list. If this operation raises LookupError, the character is
13267left untouched. Characters mapped to None are deleted.
13268[clinic start generated code]*/
13269
13270static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013272/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013275}
13276
INADA Naoki3ae20562017-01-16 20:41:20 +090013277/*[clinic input]
13278str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279
INADA Naoki3ae20562017-01-16 20:41:20 +090013280Return a copy of the string converted to uppercase.
13281[clinic start generated code]*/
13282
13283static PyObject *
13284unicode_upper_impl(PyObject *self)
13285/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013287 if (PyUnicode_READY(self) == -1)
13288 return NULL;
13289 if (PyUnicode_IS_ASCII(self))
13290 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013291 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292}
13293
INADA Naoki3ae20562017-01-16 20:41:20 +090013294/*[clinic input]
13295str.zfill as unicode_zfill
13296
13297 width: Py_ssize_t
13298 /
13299
13300Pad a numeric string with zeros on the left, to fill a field of the given width.
13301
13302The string is never truncated.
13303[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304
13305static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013306unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013307/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013309 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013310 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 int kind;
13312 void *data;
13313 Py_UCS4 chr;
13314
Benjamin Petersonbac79492012-01-14 13:34:47 -050013315 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317
Victor Stinnerc4b49542011-12-11 22:44:26 +010013318 if (PyUnicode_GET_LENGTH(self) >= width)
13319 return unicode_result_unchanged(self);
13320
13321 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322
13323 u = pad(self, fill, 0, '0');
13324
Walter Dörwald068325e2002-04-15 13:36:47 +000013325 if (u == NULL)
13326 return NULL;
13327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 kind = PyUnicode_KIND(u);
13329 data = PyUnicode_DATA(u);
13330 chr = PyUnicode_READ(kind, data, fill);
13331
13332 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 PyUnicode_WRITE(kind, data, 0, chr);
13335 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336 }
13337
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013338 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013339 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341
#if 0
/* Compiled out.  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013350PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013353Return True if S starts with the specified prefix, False otherwise.\n\
13354With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013355With optional end, stop comparing S at that position.\n\
13356prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357
13358static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013359unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013360 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013362 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013363 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013364 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013365 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013366 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367
Jesus Ceaac451502011-04-20 17:09:23 +020013368 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013369 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013370 if (PyTuple_Check(subobj)) {
13371 Py_ssize_t i;
13372 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013373 substring = PyTuple_GET_ITEM(subobj, i);
13374 if (!PyUnicode_Check(substring)) {
13375 PyErr_Format(PyExc_TypeError,
13376 "tuple for startswith must only contain str, "
13377 "not %.100s",
13378 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013380 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013381 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013382 if (result == -1)
13383 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 if (result) {
13385 Py_RETURN_TRUE;
13386 }
13387 }
13388 /* nothing matched */
13389 Py_RETURN_FALSE;
13390 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013391 if (!PyUnicode_Check(subobj)) {
13392 PyErr_Format(PyExc_TypeError,
13393 "startswith first arg must be str or "
13394 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013396 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013397 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013398 if (result == -1)
13399 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013401}
13402
13403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013404PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013407Return True if S ends with the specified suffix, False otherwise.\n\
13408With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013409With optional end, stop comparing S at that position.\n\
13410suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411
13412static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013413unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013416 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013417 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013418 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013419 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013420 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421
Jesus Ceaac451502011-04-20 17:09:23 +020013422 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013423 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013424 if (PyTuple_Check(subobj)) {
13425 Py_ssize_t i;
13426 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013427 substring = PyTuple_GET_ITEM(subobj, i);
13428 if (!PyUnicode_Check(substring)) {
13429 PyErr_Format(PyExc_TypeError,
13430 "tuple for endswith must only contain str, "
13431 "not %.100s",
13432 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013434 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013435 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013436 if (result == -1)
13437 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013438 if (result) {
13439 Py_RETURN_TRUE;
13440 }
13441 }
13442 Py_RETURN_FALSE;
13443 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013444 if (!PyUnicode_Check(subobj)) {
13445 PyErr_Format(PyExc_TypeError,
13446 "endswith first arg must be str or "
13447 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013449 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013450 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013451 if (result == -1)
13452 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013453 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454}
13455
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013456static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013457_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013458{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013459 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13460 writer->data = PyUnicode_DATA(writer->buffer);
13461
13462 if (!writer->readonly) {
13463 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013464 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013465 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013466 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013467 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13468 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13469 writer->kind = PyUnicode_WCHAR_KIND;
13470 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13471
Victor Stinner8f674cc2013-04-17 23:02:17 +020013472 /* Copy-on-write mode: set buffer size to 0 so
13473 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13474 * next write. */
13475 writer->size = 0;
13476 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013477}
13478
Victor Stinnerd3f08822012-05-29 12:57:52 +020013479void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013480_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013481{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013482 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013483
13484 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013485 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013486
13487 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13488 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13489 writer->kind = PyUnicode_WCHAR_KIND;
13490 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013491}
13492
Victor Stinnerd3f08822012-05-29 12:57:52 +020013493int
13494_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13495 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013496{
13497 Py_ssize_t newlen;
13498 PyObject *newbuffer;
13499
Victor Stinner2740e462016-09-06 16:58:36 -070013500 assert(maxchar <= MAX_UNICODE);
13501
Victor Stinnerca9381e2015-09-22 00:58:32 +020013502 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013503 assert((maxchar > writer->maxchar && length >= 0)
13504 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013505
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 if (length > PY_SSIZE_T_MAX - writer->pos) {
13507 PyErr_NoMemory();
13508 return -1;
13509 }
13510 newlen = writer->pos + length;
13511
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013512 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013513
Victor Stinnerd3f08822012-05-29 12:57:52 +020013514 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013515 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013516 if (writer->overallocate
13517 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13518 /* overallocate to limit the number of realloc() */
13519 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013520 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013521 if (newlen < writer->min_length)
13522 newlen = writer->min_length;
13523
Victor Stinnerd3f08822012-05-29 12:57:52 +020013524 writer->buffer = PyUnicode_New(newlen, maxchar);
13525 if (writer->buffer == NULL)
13526 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013527 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013528 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013529 if (writer->overallocate
13530 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13531 /* overallocate to limit the number of realloc() */
13532 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013533 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013534 if (newlen < writer->min_length)
13535 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013537 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013538 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013539 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013540 newbuffer = PyUnicode_New(newlen, maxchar);
13541 if (newbuffer == NULL)
13542 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13544 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013545 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013546 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013547 }
13548 else {
13549 newbuffer = resize_compact(writer->buffer, newlen);
13550 if (newbuffer == NULL)
13551 return -1;
13552 }
13553 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013554 }
13555 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013556 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557 newbuffer = PyUnicode_New(writer->size, maxchar);
13558 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013559 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13561 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013562 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013563 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013564 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013565 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013566
13567#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013568}
13569
Victor Stinnerca9381e2015-09-22 00:58:32 +020013570int
13571_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13572 enum PyUnicode_Kind kind)
13573{
13574 Py_UCS4 maxchar;
13575
13576 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13577 assert(writer->kind < kind);
13578
13579 switch (kind)
13580 {
13581 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13582 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13583 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13584 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013585 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013586 }
13587
13588 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13589}
13590
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013591static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013592_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013593{
Victor Stinner2740e462016-09-06 16:58:36 -070013594 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013595 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13596 return -1;
13597 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13598 writer->pos++;
13599 return 0;
13600}
13601
13602int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013603_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13604{
13605 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13606}
13607
13608int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013609_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13610{
13611 Py_UCS4 maxchar;
13612 Py_ssize_t len;
13613
13614 if (PyUnicode_READY(str) == -1)
13615 return -1;
13616 len = PyUnicode_GET_LENGTH(str);
13617 if (len == 0)
13618 return 0;
13619 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13620 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013621 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013622 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013623 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013624 Py_INCREF(str);
13625 writer->buffer = str;
13626 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013627 writer->pos += len;
13628 return 0;
13629 }
13630 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13631 return -1;
13632 }
13633 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13634 str, 0, len);
13635 writer->pos += len;
13636 return 0;
13637}
13638
Victor Stinnere215d962012-10-06 23:03:36 +020013639int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013640_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13641 Py_ssize_t start, Py_ssize_t end)
13642{
13643 Py_UCS4 maxchar;
13644 Py_ssize_t len;
13645
13646 if (PyUnicode_READY(str) == -1)
13647 return -1;
13648
13649 assert(0 <= start);
13650 assert(end <= PyUnicode_GET_LENGTH(str));
13651 assert(start <= end);
13652
13653 if (end == 0)
13654 return 0;
13655
13656 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13657 return _PyUnicodeWriter_WriteStr(writer, str);
13658
13659 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13660 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13661 else
13662 maxchar = writer->maxchar;
13663 len = end - start;
13664
13665 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13666 return -1;
13667
13668 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13669 str, start, len);
13670 writer->pos += len;
13671 return 0;
13672}
13673
13674int
Victor Stinner4a587072013-11-19 12:54:53 +010013675_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13676 const char *ascii, Py_ssize_t len)
13677{
13678 if (len == -1)
13679 len = strlen(ascii);
13680
13681 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13682
13683 if (writer->buffer == NULL && !writer->overallocate) {
13684 PyObject *str;
13685
13686 str = _PyUnicode_FromASCII(ascii, len);
13687 if (str == NULL)
13688 return -1;
13689
13690 writer->readonly = 1;
13691 writer->buffer = str;
13692 _PyUnicodeWriter_Update(writer);
13693 writer->pos += len;
13694 return 0;
13695 }
13696
13697 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13698 return -1;
13699
13700 switch (writer->kind)
13701 {
13702 case PyUnicode_1BYTE_KIND:
13703 {
13704 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13705 Py_UCS1 *data = writer->data;
13706
Christian Heimesf051e432016-09-13 20:22:02 +020013707 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013708 break;
13709 }
13710 case PyUnicode_2BYTE_KIND:
13711 {
13712 _PyUnicode_CONVERT_BYTES(
13713 Py_UCS1, Py_UCS2,
13714 ascii, ascii + len,
13715 (Py_UCS2 *)writer->data + writer->pos);
13716 break;
13717 }
13718 case PyUnicode_4BYTE_KIND:
13719 {
13720 _PyUnicode_CONVERT_BYTES(
13721 Py_UCS1, Py_UCS4,
13722 ascii, ascii + len,
13723 (Py_UCS4 *)writer->data + writer->pos);
13724 break;
13725 }
13726 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013727 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013728 }
13729
13730 writer->pos += len;
13731 return 0;
13732}
13733
13734int
13735_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13736 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013737{
13738 Py_UCS4 maxchar;
13739
13740 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13741 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13742 return -1;
13743 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13744 writer->pos += len;
13745 return 0;
13746}
13747
Victor Stinnerd3f08822012-05-29 12:57:52 +020013748PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013749_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013750{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013751 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013752
Victor Stinnerd3f08822012-05-29 12:57:52 +020013753 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013754 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013755 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013756 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013757
13758 str = writer->buffer;
13759 writer->buffer = NULL;
13760
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013761 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013762 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13763 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013764 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013765
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013766 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13767 PyObject *str2;
13768 str2 = resize_compact(str, writer->pos);
13769 if (str2 == NULL) {
13770 Py_DECREF(str);
13771 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013772 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013773 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013774 }
13775
Victor Stinner15a0bd32013-07-08 22:29:55 +020013776 assert(_PyUnicode_CheckConsistency(str, 1));
13777 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013778}
13779
Victor Stinnerd3f08822012-05-29 12:57:52 +020013780void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013781_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013782{
13783 Py_CLEAR(writer->buffer);
13784}
13785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013786#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013787
13788PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013789 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013790\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013791Return a formatted version of S, using substitutions from args and kwargs.\n\
13792The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013793
Eric Smith27bbca62010-11-04 17:06:58 +000013794PyDoc_STRVAR(format_map__doc__,
13795 "S.format_map(mapping) -> str\n\
13796\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013797Return a formatted version of S, using substitutions from mapping.\n\
13798The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013799
INADA Naoki3ae20562017-01-16 20:41:20 +090013800/*[clinic input]
13801str.__format__ as unicode___format__
13802
13803 format_spec: unicode
13804 /
13805
13806Return a formatted version of the string as described by format_spec.
13807[clinic start generated code]*/
13808
Eric Smith4a7d76d2008-05-30 18:10:19 +000013809static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013810unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013811/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013812{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013813 _PyUnicodeWriter writer;
13814 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013815
Victor Stinnerd3f08822012-05-29 12:57:52 +020013816 if (PyUnicode_READY(self) == -1)
13817 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013818 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013819 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13820 self, format_spec, 0,
13821 PyUnicode_GET_LENGTH(format_spec));
13822 if (ret == -1) {
13823 _PyUnicodeWriter_Dealloc(&writer);
13824 return NULL;
13825 }
13826 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013827}
13828
INADA Naoki3ae20562017-01-16 20:41:20 +090013829/*[clinic input]
13830str.__sizeof__ as unicode_sizeof
13831
13832Return the size of the string in memory, in bytes.
13833[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013834
13835static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013836unicode_sizeof_impl(PyObject *self)
13837/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013839 Py_ssize_t size;
13840
13841 /* If it's a compact object, account for base structure +
13842 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013843 if (PyUnicode_IS_COMPACT_ASCII(self))
13844 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13845 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013846 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013847 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013848 else {
13849 /* If it is a two-block object, account for base object, and
13850 for character block if present. */
13851 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013852 if (_PyUnicode_DATA_ANY(self))
13853 size += (PyUnicode_GET_LENGTH(self) + 1) *
13854 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013855 }
13856 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013857 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013858 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13859 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13860 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13861 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013862
13863 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013864}
13865
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013866static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013867unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013868{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013869 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013870 if (!copy)
13871 return NULL;
13872 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013873}
13874
/* Method table for the unicode (str) type.

   Entries spelled UNICODE_*_METHODDEF are macros generated by Argument
   Clinic; the remaining entries are written out by hand.  The table is
   terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}        /* sentinel */
};
13931
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013932static PyObject *
13933unicode_mod(PyObject *v, PyObject *w)
13934{
Brian Curtindfc80e32011-08-10 20:28:54 -050013935 if (!PyUnicode_Check(v))
13936 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013938}
13939
13940static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013941 0, /*nb_add*/
13942 0, /*nb_subtract*/
13943 0, /*nb_multiply*/
13944 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013945};
13946
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013948 (lenfunc) unicode_length, /* sq_length */
13949 PyUnicode_Concat, /* sq_concat */
13950 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13951 (ssizeargfunc) unicode_getitem, /* sq_item */
13952 0, /* sq_slice */
13953 0, /* sq_ass_item */
13954 0, /* sq_ass_slice */
13955 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956};
13957
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013958static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013959unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013961 if (PyUnicode_READY(self) == -1)
13962 return NULL;
13963
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013964 if (PyIndex_Check(item)) {
13965 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013966 if (i == -1 && PyErr_Occurred())
13967 return NULL;
13968 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013969 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013970 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013971 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013972 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013973 PyObject *result;
13974 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013975 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013976 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013977
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013978 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013979 return NULL;
13980 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013981 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13982 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013983
13984 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013985 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013986 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013987 slicelength == PyUnicode_GET_LENGTH(self)) {
13988 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013989 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013990 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013991 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013992 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013993 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013994 src_kind = PyUnicode_KIND(self);
13995 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013996 if (!PyUnicode_IS_ASCII(self)) {
13997 kind_limit = kind_maxchar_limit(src_kind);
13998 max_char = 0;
13999 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14000 ch = PyUnicode_READ(src_kind, src_data, cur);
14001 if (ch > max_char) {
14002 max_char = ch;
14003 if (max_char >= kind_limit)
14004 break;
14005 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014006 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014007 }
Victor Stinner55c99112011-10-13 01:17:06 +020014008 else
14009 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014010 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014011 if (result == NULL)
14012 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014013 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014014 dest_data = PyUnicode_DATA(result);
14015
14016 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014017 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14018 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014019 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014020 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014021 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014022 } else {
14023 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14024 return NULL;
14025 }
14026}
14027
14028static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 (lenfunc)unicode_length, /* mp_length */
14030 (binaryfunc)unicode_subscript, /* mp_subscript */
14031 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014032};
14033
Guido van Rossumd57fd912000-03-10 22:53:23 +000014034
Guido van Rossumd57fd912000-03-10 22:53:23 +000014035/* Helpers for PyUnicode_Format() */
14036
Victor Stinnera47082312012-10-04 02:19:54 +020014037struct unicode_formatter_t {
14038 PyObject *args;
14039 int args_owned;
14040 Py_ssize_t arglen, argidx;
14041 PyObject *dict;
14042
14043 enum PyUnicode_Kind fmtkind;
14044 Py_ssize_t fmtcnt, fmtpos;
14045 void *fmtdata;
14046 PyObject *fmtstr;
14047
14048 _PyUnicodeWriter writer;
14049};
14050
14051struct unicode_format_arg_t {
14052 Py_UCS4 ch;
14053 int flags;
14054 Py_ssize_t width;
14055 int prec;
14056 int sign;
14057};
14058
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014060unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014061{
Victor Stinnera47082312012-10-04 02:19:54 +020014062 Py_ssize_t argidx = ctx->argidx;
14063
14064 if (argidx < ctx->arglen) {
14065 ctx->argidx++;
14066 if (ctx->arglen < 0)
14067 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014068 else
Victor Stinnera47082312012-10-04 02:19:54 +020014069 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014070 }
14071 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014072 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073 return NULL;
14074}
14075
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014076/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014077
Victor Stinnera47082312012-10-04 02:19:54 +020014078/* Format a float into the writer if the writer is not NULL, or into *p_output
14079 otherwise.
14080
14081 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014082static int
Victor Stinnera47082312012-10-04 02:19:54 +020014083formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14084 PyObject **p_output,
14085 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014086{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014087 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014088 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014089 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014090 int prec;
14091 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014092
Guido van Rossumd57fd912000-03-10 22:53:23 +000014093 x = PyFloat_AsDouble(v);
14094 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014095 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014096
Victor Stinnera47082312012-10-04 02:19:54 +020014097 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014099 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014100
Victor Stinnera47082312012-10-04 02:19:54 +020014101 if (arg->flags & F_ALT)
14102 dtoa_flags = Py_DTSF_ALT;
14103 else
14104 dtoa_flags = 0;
14105 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014106 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014107 return -1;
14108 len = strlen(p);
14109 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014110 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014111 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014112 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014113 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014114 }
14115 else
14116 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014117 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014118 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014119}
14120
Victor Stinnerd0880d52012-04-27 23:40:13 +020014121/* formatlong() emulates the format codes d, u, o, x and X, and
14122 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14123 * Python's regular ints.
14124 * Return value: a new PyUnicodeObject*, or NULL if error.
14125 * The output string is of the form
14126 * "-"? ("0x" | "0X")? digit+
14127 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14128 * set in flags. The case of hex digits will be correct,
14129 * There will be at least prec digits, zero-filled on the left if
14130 * necessary to get that many.
14131 * val object to be converted
14132 * flags bitmask of format flags; only F_ALT is looked at
14133 * prec minimum number of digits; 0-fill on left if needed
14134 * type a character in [duoxX]; u acts the same as d
14135 *
14136 * CAUTION: o, x and X conversions on regular ints can never
14137 * produce a '-' sign, but can for Python's unbounded ints.
14138 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014139PyObject *
14140_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014141{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014142 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014143 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014144 Py_ssize_t i;
14145 int sign; /* 1 if '-', else 0 */
14146 int len; /* number of characters */
14147 Py_ssize_t llen;
14148 int numdigits; /* len == numnondigits + numdigits */
14149 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014150
Victor Stinnerd0880d52012-04-27 23:40:13 +020014151 /* Avoid exceeding SSIZE_T_MAX */
14152 if (prec > INT_MAX-3) {
14153 PyErr_SetString(PyExc_OverflowError,
14154 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014156 }
14157
14158 assert(PyLong_Check(val));
14159
14160 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014161 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014162 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014163 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014164 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014165 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014166 /* int and int subclasses should print numerically when a numeric */
14167 /* format code is used (see issue18780) */
14168 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014169 break;
14170 case 'o':
14171 numnondigits = 2;
14172 result = PyNumber_ToBase(val, 8);
14173 break;
14174 case 'x':
14175 case 'X':
14176 numnondigits = 2;
14177 result = PyNumber_ToBase(val, 16);
14178 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014179 }
14180 if (!result)
14181 return NULL;
14182
14183 assert(unicode_modifiable(result));
14184 assert(PyUnicode_IS_READY(result));
14185 assert(PyUnicode_IS_ASCII(result));
14186
14187 /* To modify the string in-place, there can only be one reference. */
14188 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014189 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014190 PyErr_BadInternalCall();
14191 return NULL;
14192 }
14193 buf = PyUnicode_DATA(result);
14194 llen = PyUnicode_GET_LENGTH(result);
14195 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014196 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014197 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014198 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014199 return NULL;
14200 }
14201 len = (int)llen;
14202 sign = buf[0] == '-';
14203 numnondigits += sign;
14204 numdigits = len - numnondigits;
14205 assert(numdigits > 0);
14206
14207 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014208 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014209 (type == 'o' || type == 'x' || type == 'X'))) {
14210 assert(buf[sign] == '0');
14211 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14212 buf[sign+1] == 'o');
14213 numnondigits -= 2;
14214 buf += 2;
14215 len -= 2;
14216 if (sign)
14217 buf[0] = '-';
14218 assert(len == numnondigits + numdigits);
14219 assert(numdigits > 0);
14220 }
14221
14222 /* Fill with leading zeroes to meet minimum width. */
14223 if (prec > numdigits) {
14224 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14225 numnondigits + prec);
14226 char *b1;
14227 if (!r1) {
14228 Py_DECREF(result);
14229 return NULL;
14230 }
14231 b1 = PyBytes_AS_STRING(r1);
14232 for (i = 0; i < numnondigits; ++i)
14233 *b1++ = *buf++;
14234 for (i = 0; i < prec - numdigits; i++)
14235 *b1++ = '0';
14236 for (i = 0; i < numdigits; i++)
14237 *b1++ = *buf++;
14238 *b1 = '\0';
14239 Py_DECREF(result);
14240 result = r1;
14241 buf = PyBytes_AS_STRING(result);
14242 len = numnondigits + prec;
14243 }
14244
14245 /* Fix up case for hex conversions. */
14246 if (type == 'X') {
14247 /* Need to convert all lower case letters to upper case.
14248 and need to convert 0x to 0X (and -0x to -0X). */
14249 for (i = 0; i < len; i++)
14250 if (buf[i] >= 'a' && buf[i] <= 'x')
14251 buf[i] -= 'a'-'A';
14252 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014253 if (!PyUnicode_Check(result)
14254 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014255 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014256 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014257 Py_DECREF(result);
14258 result = unicode;
14259 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014260 else if (len != PyUnicode_GET_LENGTH(result)) {
14261 if (PyUnicode_Resize(&result, len) < 0)
14262 Py_CLEAR(result);
14263 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014264 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014265}
14266
Ethan Furmandf3ed242014-01-05 06:50:30 -080014267/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014268 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014269 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014270 * -1 and raise an exception on error */
14271static int
Victor Stinnera47082312012-10-04 02:19:54 +020014272mainformatlong(PyObject *v,
14273 struct unicode_format_arg_t *arg,
14274 PyObject **p_output,
14275 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014276{
14277 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014278 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014279
14280 if (!PyNumber_Check(v))
14281 goto wrongtype;
14282
Ethan Furman9ab74802014-03-21 06:38:46 -070014283 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014285 if (type == 'o' || type == 'x' || type == 'X') {
14286 iobj = PyNumber_Index(v);
14287 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014288 if (PyErr_ExceptionMatches(PyExc_TypeError))
14289 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014290 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014291 }
14292 }
14293 else {
14294 iobj = PyNumber_Long(v);
14295 if (iobj == NULL ) {
14296 if (PyErr_ExceptionMatches(PyExc_TypeError))
14297 goto wrongtype;
14298 return -1;
14299 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014300 }
14301 assert(PyLong_Check(iobj));
14302 }
14303 else {
14304 iobj = v;
14305 Py_INCREF(iobj);
14306 }
14307
14308 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014309 && arg->width == -1 && arg->prec == -1
14310 && !(arg->flags & (F_SIGN | F_BLANK))
14311 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014312 {
14313 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014314 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014315 int base;
14316
Victor Stinnera47082312012-10-04 02:19:54 +020014317 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014318 {
14319 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014320 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014321 case 'd':
14322 case 'i':
14323 case 'u':
14324 base = 10;
14325 break;
14326 case 'o':
14327 base = 8;
14328 break;
14329 case 'x':
14330 case 'X':
14331 base = 16;
14332 break;
14333 }
14334
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014335 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14336 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014337 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014338 }
14339 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014340 return 1;
14341 }
14342
Ethan Furmanb95b5612015-01-23 20:05:18 -080014343 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014344 Py_DECREF(iobj);
14345 if (res == NULL)
14346 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014347 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014348 return 0;
14349
14350wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014351 switch(type)
14352 {
14353 case 'o':
14354 case 'x':
14355 case 'X':
14356 PyErr_Format(PyExc_TypeError,
14357 "%%%c format: an integer is required, "
14358 "not %.200s",
14359 type, Py_TYPE(v)->tp_name);
14360 break;
14361 default:
14362 PyErr_Format(PyExc_TypeError,
14363 "%%%c format: a number is required, "
14364 "not %.200s",
14365 type, Py_TYPE(v)->tp_name);
14366 break;
14367 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 return -1;
14369}
14370
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014371static Py_UCS4
14372formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014373{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014374 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014375 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014376 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014377 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014378 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014379 goto onError;
14380 }
14381 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014382 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014383 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014384 /* make sure number is a type of integer */
14385 if (!PyLong_Check(v)) {
14386 iobj = PyNumber_Index(v);
14387 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014388 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014389 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014390 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014391 Py_DECREF(iobj);
14392 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014393 else {
14394 x = PyLong_AsLong(v);
14395 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014396 if (x == -1 && PyErr_Occurred())
14397 goto onError;
14398
Victor Stinner8faf8212011-12-08 22:14:11 +010014399 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 PyErr_SetString(PyExc_OverflowError,
14401 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014402 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014403 }
14404
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014405 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014406 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014407
Benjamin Peterson29060642009-01-31 22:14:21 +000014408 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014409 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014410 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014411 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014412}
14413
Victor Stinnera47082312012-10-04 02:19:54 +020014414/* Parse options of an argument: flags, width, precision.
14415 Handle also "%(name)" syntax.
14416
14417 Return 0 if the argument has been formatted into arg->str.
14418 Return 1 if the argument has been written into ctx->writer,
14419 Raise an exception and return -1 on error. */
14420static int
14421unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14422 struct unicode_format_arg_t *arg)
14423{
14424#define FORMAT_READ(ctx) \
14425 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14426
14427 PyObject *v;
14428
Victor Stinnera47082312012-10-04 02:19:54 +020014429 if (arg->ch == '(') {
14430 /* Get argument value from a dictionary. Example: "%(name)s". */
14431 Py_ssize_t keystart;
14432 Py_ssize_t keylen;
14433 PyObject *key;
14434 int pcount = 1;
14435
14436 if (ctx->dict == NULL) {
14437 PyErr_SetString(PyExc_TypeError,
14438 "format requires a mapping");
14439 return -1;
14440 }
14441 ++ctx->fmtpos;
14442 --ctx->fmtcnt;
14443 keystart = ctx->fmtpos;
14444 /* Skip over balanced parentheses */
14445 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14446 arg->ch = FORMAT_READ(ctx);
14447 if (arg->ch == ')')
14448 --pcount;
14449 else if (arg->ch == '(')
14450 ++pcount;
14451 ctx->fmtpos++;
14452 }
14453 keylen = ctx->fmtpos - keystart - 1;
14454 if (ctx->fmtcnt < 0 || pcount > 0) {
14455 PyErr_SetString(PyExc_ValueError,
14456 "incomplete format key");
14457 return -1;
14458 }
14459 key = PyUnicode_Substring(ctx->fmtstr,
14460 keystart, keystart + keylen);
14461 if (key == NULL)
14462 return -1;
14463 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014464 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014465 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014466 }
14467 ctx->args = PyObject_GetItem(ctx->dict, key);
14468 Py_DECREF(key);
14469 if (ctx->args == NULL)
14470 return -1;
14471 ctx->args_owned = 1;
14472 ctx->arglen = -1;
14473 ctx->argidx = -2;
14474 }
14475
14476 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014477 while (--ctx->fmtcnt >= 0) {
14478 arg->ch = FORMAT_READ(ctx);
14479 ctx->fmtpos++;
14480 switch (arg->ch) {
14481 case '-': arg->flags |= F_LJUST; continue;
14482 case '+': arg->flags |= F_SIGN; continue;
14483 case ' ': arg->flags |= F_BLANK; continue;
14484 case '#': arg->flags |= F_ALT; continue;
14485 case '0': arg->flags |= F_ZERO; continue;
14486 }
14487 break;
14488 }
14489
14490 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014491 if (arg->ch == '*') {
14492 v = unicode_format_getnextarg(ctx);
14493 if (v == NULL)
14494 return -1;
14495 if (!PyLong_Check(v)) {
14496 PyErr_SetString(PyExc_TypeError,
14497 "* wants int");
14498 return -1;
14499 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014500 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014501 if (arg->width == -1 && PyErr_Occurred())
14502 return -1;
14503 if (arg->width < 0) {
14504 arg->flags |= F_LJUST;
14505 arg->width = -arg->width;
14506 }
14507 if (--ctx->fmtcnt >= 0) {
14508 arg->ch = FORMAT_READ(ctx);
14509 ctx->fmtpos++;
14510 }
14511 }
14512 else if (arg->ch >= '0' && arg->ch <= '9') {
14513 arg->width = arg->ch - '0';
14514 while (--ctx->fmtcnt >= 0) {
14515 arg->ch = FORMAT_READ(ctx);
14516 ctx->fmtpos++;
14517 if (arg->ch < '0' || arg->ch > '9')
14518 break;
14519 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14520 mixing signed and unsigned comparison. Since arg->ch is between
14521 '0' and '9', casting to int is safe. */
14522 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14523 PyErr_SetString(PyExc_ValueError,
14524 "width too big");
14525 return -1;
14526 }
14527 arg->width = arg->width*10 + (arg->ch - '0');
14528 }
14529 }
14530
14531 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014532 if (arg->ch == '.') {
14533 arg->prec = 0;
14534 if (--ctx->fmtcnt >= 0) {
14535 arg->ch = FORMAT_READ(ctx);
14536 ctx->fmtpos++;
14537 }
14538 if (arg->ch == '*') {
14539 v = unicode_format_getnextarg(ctx);
14540 if (v == NULL)
14541 return -1;
14542 if (!PyLong_Check(v)) {
14543 PyErr_SetString(PyExc_TypeError,
14544 "* wants int");
14545 return -1;
14546 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014547 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014548 if (arg->prec == -1 && PyErr_Occurred())
14549 return -1;
14550 if (arg->prec < 0)
14551 arg->prec = 0;
14552 if (--ctx->fmtcnt >= 0) {
14553 arg->ch = FORMAT_READ(ctx);
14554 ctx->fmtpos++;
14555 }
14556 }
14557 else if (arg->ch >= '0' && arg->ch <= '9') {
14558 arg->prec = arg->ch - '0';
14559 while (--ctx->fmtcnt >= 0) {
14560 arg->ch = FORMAT_READ(ctx);
14561 ctx->fmtpos++;
14562 if (arg->ch < '0' || arg->ch > '9')
14563 break;
14564 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14565 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014566 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014567 return -1;
14568 }
14569 arg->prec = arg->prec*10 + (arg->ch - '0');
14570 }
14571 }
14572 }
14573
14574 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14575 if (ctx->fmtcnt >= 0) {
14576 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14577 if (--ctx->fmtcnt >= 0) {
14578 arg->ch = FORMAT_READ(ctx);
14579 ctx->fmtpos++;
14580 }
14581 }
14582 }
14583 if (ctx->fmtcnt < 0) {
14584 PyErr_SetString(PyExc_ValueError,
14585 "incomplete format");
14586 return -1;
14587 }
14588 return 0;
14589
14590#undef FORMAT_READ
14591}
14592
14593/* Format one argument. Supported conversion specifiers:
14594
14595 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014596 - "i", "d", "u": int or float
14597 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014598 - "e", "E", "f", "F", "g", "G": float
14599 - "c": int or str (1 character)
14600
Victor Stinner8dbd4212012-12-04 09:30:24 +010014601 When possible, the output is written directly into the Unicode writer
14602 (ctx->writer). A string is created when padding is required.
14603
Victor Stinnera47082312012-10-04 02:19:54 +020014604 Return 0 if the argument has been formatted into *p_str,
14605 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014606 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014607static int
14608unicode_format_arg_format(struct unicode_formatter_t *ctx,
14609 struct unicode_format_arg_t *arg,
14610 PyObject **p_str)
14611{
14612 PyObject *v;
14613 _PyUnicodeWriter *writer = &ctx->writer;
14614
14615 if (ctx->fmtcnt == 0)
14616 ctx->writer.overallocate = 0;
14617
Victor Stinnera47082312012-10-04 02:19:54 +020014618 v = unicode_format_getnextarg(ctx);
14619 if (v == NULL)
14620 return -1;
14621
Victor Stinnera47082312012-10-04 02:19:54 +020014622
14623 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014624 case 's':
14625 case 'r':
14626 case 'a':
14627 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14628 /* Fast path */
14629 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14630 return -1;
14631 return 1;
14632 }
14633
14634 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14635 *p_str = v;
14636 Py_INCREF(*p_str);
14637 }
14638 else {
14639 if (arg->ch == 's')
14640 *p_str = PyObject_Str(v);
14641 else if (arg->ch == 'r')
14642 *p_str = PyObject_Repr(v);
14643 else
14644 *p_str = PyObject_ASCII(v);
14645 }
14646 break;
14647
14648 case 'i':
14649 case 'd':
14650 case 'u':
14651 case 'o':
14652 case 'x':
14653 case 'X':
14654 {
14655 int ret = mainformatlong(v, arg, p_str, writer);
14656 if (ret != 0)
14657 return ret;
14658 arg->sign = 1;
14659 break;
14660 }
14661
14662 case 'e':
14663 case 'E':
14664 case 'f':
14665 case 'F':
14666 case 'g':
14667 case 'G':
14668 if (arg->width == -1 && arg->prec == -1
14669 && !(arg->flags & (F_SIGN | F_BLANK)))
14670 {
14671 /* Fast path */
14672 if (formatfloat(v, arg, NULL, writer) == -1)
14673 return -1;
14674 return 1;
14675 }
14676
14677 arg->sign = 1;
14678 if (formatfloat(v, arg, p_str, NULL) == -1)
14679 return -1;
14680 break;
14681
14682 case 'c':
14683 {
14684 Py_UCS4 ch = formatchar(v);
14685 if (ch == (Py_UCS4) -1)
14686 return -1;
14687 if (arg->width == -1 && arg->prec == -1) {
14688 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014689 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014690 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014691 return 1;
14692 }
14693 *p_str = PyUnicode_FromOrdinal(ch);
14694 break;
14695 }
14696
14697 default:
14698 PyErr_Format(PyExc_ValueError,
14699 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014700 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014701 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14702 (int)arg->ch,
14703 ctx->fmtpos - 1);
14704 return -1;
14705 }
14706 if (*p_str == NULL)
14707 return -1;
14708 assert (PyUnicode_Check(*p_str));
14709 return 0;
14710}
14711
14712static int
14713unicode_format_arg_output(struct unicode_formatter_t *ctx,
14714 struct unicode_format_arg_t *arg,
14715 PyObject *str)
14716{
14717 Py_ssize_t len;
14718 enum PyUnicode_Kind kind;
14719 void *pbuf;
14720 Py_ssize_t pindex;
14721 Py_UCS4 signchar;
14722 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014723 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014724 Py_ssize_t sublen;
14725 _PyUnicodeWriter *writer = &ctx->writer;
14726 Py_UCS4 fill;
14727
14728 fill = ' ';
14729 if (arg->sign && arg->flags & F_ZERO)
14730 fill = '0';
14731
14732 if (PyUnicode_READY(str) == -1)
14733 return -1;
14734
14735 len = PyUnicode_GET_LENGTH(str);
14736 if ((arg->width == -1 || arg->width <= len)
14737 && (arg->prec == -1 || arg->prec >= len)
14738 && !(arg->flags & (F_SIGN | F_BLANK)))
14739 {
14740 /* Fast path */
14741 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14742 return -1;
14743 return 0;
14744 }
14745
14746 /* Truncate the string for "s", "r" and "a" formats
14747 if the precision is set */
14748 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14749 if (arg->prec >= 0 && len > arg->prec)
14750 len = arg->prec;
14751 }
14752
14753 /* Adjust sign and width */
14754 kind = PyUnicode_KIND(str);
14755 pbuf = PyUnicode_DATA(str);
14756 pindex = 0;
14757 signchar = '\0';
14758 if (arg->sign) {
14759 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14760 if (ch == '-' || ch == '+') {
14761 signchar = ch;
14762 len--;
14763 pindex++;
14764 }
14765 else if (arg->flags & F_SIGN)
14766 signchar = '+';
14767 else if (arg->flags & F_BLANK)
14768 signchar = ' ';
14769 else
14770 arg->sign = 0;
14771 }
14772 if (arg->width < len)
14773 arg->width = len;
14774
14775 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014776 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014777 if (!(arg->flags & F_LJUST)) {
14778 if (arg->sign) {
14779 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014780 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014781 }
14782 else {
14783 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014784 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014785 }
14786 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014787 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14788 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014789 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014790 }
14791
Victor Stinnera47082312012-10-04 02:19:54 +020014792 buflen = arg->width;
14793 if (arg->sign && len == arg->width)
14794 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014795 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014796 return -1;
14797
14798 /* Write the sign if needed */
14799 if (arg->sign) {
14800 if (fill != ' ') {
14801 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14802 writer->pos += 1;
14803 }
14804 if (arg->width > len)
14805 arg->width--;
14806 }
14807
14808 /* Write the numeric prefix for "x", "X" and "o" formats
14809 if the alternate form is used.
14810 For example, write "0x" for the "%#x" format. */
14811 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14812 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14813 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14814 if (fill != ' ') {
14815 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14816 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14817 writer->pos += 2;
14818 pindex += 2;
14819 }
14820 arg->width -= 2;
14821 if (arg->width < 0)
14822 arg->width = 0;
14823 len -= 2;
14824 }
14825
14826 /* Pad left with the fill character if needed */
14827 if (arg->width > len && !(arg->flags & F_LJUST)) {
14828 sublen = arg->width - len;
14829 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14830 writer->pos += sublen;
14831 arg->width = len;
14832 }
14833
14834 /* If padding with spaces: write sign if needed and/or numeric prefix if
14835 the alternate form is used */
14836 if (fill == ' ') {
14837 if (arg->sign) {
14838 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14839 writer->pos += 1;
14840 }
14841 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14842 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14843 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14844 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14845 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14846 writer->pos += 2;
14847 pindex += 2;
14848 }
14849 }
14850
14851 /* Write characters */
14852 if (len) {
14853 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14854 str, pindex, len);
14855 writer->pos += len;
14856 }
14857
14858 /* Pad right with the fill character if needed */
14859 if (arg->width > len) {
14860 sublen = arg->width - len;
14861 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14862 writer->pos += sublen;
14863 }
14864 return 0;
14865}
14866
14867/* Helper of PyUnicode_Format(): format one arg.
14868 Return 0 on success, raise an exception and return -1 on error. */
14869static int
14870unicode_format_arg(struct unicode_formatter_t *ctx)
14871{
14872 struct unicode_format_arg_t arg;
14873 PyObject *str;
14874 int ret;
14875
Victor Stinner8dbd4212012-12-04 09:30:24 +010014876 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014877 if (arg.ch == '%') {
14878 ctx->fmtpos++;
14879 ctx->fmtcnt--;
14880 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14881 return -1;
14882 return 0;
14883 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014884 arg.flags = 0;
14885 arg.width = -1;
14886 arg.prec = -1;
14887 arg.sign = 0;
14888 str = NULL;
14889
Victor Stinnera47082312012-10-04 02:19:54 +020014890 ret = unicode_format_arg_parse(ctx, &arg);
14891 if (ret == -1)
14892 return -1;
14893
14894 ret = unicode_format_arg_format(ctx, &arg, &str);
14895 if (ret == -1)
14896 return -1;
14897
14898 if (ret != 1) {
14899 ret = unicode_format_arg_output(ctx, &arg, str);
14900 Py_DECREF(str);
14901 if (ret == -1)
14902 return -1;
14903 }
14904
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014905 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014906 PyErr_SetString(PyExc_TypeError,
14907 "not all arguments converted during string formatting");
14908 return -1;
14909 }
14910 return 0;
14911}
14912
Alexander Belopolsky40018472011-02-26 01:02:56 +000014913PyObject *
14914PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014915{
Victor Stinnera47082312012-10-04 02:19:54 +020014916 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014917
Guido van Rossumd57fd912000-03-10 22:53:23 +000014918 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014919 PyErr_BadInternalCall();
14920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014921 }
Victor Stinnera47082312012-10-04 02:19:54 +020014922
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014923 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014924 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014925
14926 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014927 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14928 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14929 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14930 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014931
Victor Stinner8f674cc2013-04-17 23:02:17 +020014932 _PyUnicodeWriter_Init(&ctx.writer);
14933 ctx.writer.min_length = ctx.fmtcnt + 100;
14934 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014935
Guido van Rossumd57fd912000-03-10 22:53:23 +000014936 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014937 ctx.arglen = PyTuple_Size(args);
14938 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014939 }
14940 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014941 ctx.arglen = -1;
14942 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943 }
Victor Stinnera47082312012-10-04 02:19:54 +020014944 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014945 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014946 ctx.dict = args;
14947 else
14948 ctx.dict = NULL;
14949 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014950
Victor Stinnera47082312012-10-04 02:19:54 +020014951 while (--ctx.fmtcnt >= 0) {
14952 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014953 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014954
14955 nonfmtpos = ctx.fmtpos++;
14956 while (ctx.fmtcnt >= 0 &&
14957 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14958 ctx.fmtpos++;
14959 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014960 }
Victor Stinnera47082312012-10-04 02:19:54 +020014961 if (ctx.fmtcnt < 0) {
14962 ctx.fmtpos--;
14963 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014964 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014965
Victor Stinnercfc4c132013-04-03 01:48:39 +020014966 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14967 nonfmtpos, ctx.fmtpos) < 0)
14968 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 }
14970 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014971 ctx.fmtpos++;
14972 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014973 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014974 }
14975 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014976
Victor Stinnera47082312012-10-04 02:19:54 +020014977 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014978 PyErr_SetString(PyExc_TypeError,
14979 "not all arguments converted during string formatting");
14980 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981 }
14982
Victor Stinnera47082312012-10-04 02:19:54 +020014983 if (ctx.args_owned) {
14984 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014985 }
Victor Stinnera47082312012-10-04 02:19:54 +020014986 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014987
Benjamin Peterson29060642009-01-31 22:14:21 +000014988 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014989 _PyUnicodeWriter_Dealloc(&ctx.writer);
14990 if (ctx.args_owned) {
14991 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014992 }
14993 return NULL;
14994}
14995
Jeremy Hylton938ace62002-07-17 16:30:39 +000014996static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014997unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14998
Tim Peters6d6c1a32001-08-02 04:15:00 +000014999static PyObject *
15000unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15001{
Benjamin Peterson29060642009-01-31 22:14:21 +000015002 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015003 static char *kwlist[] = {"object", "encoding", "errors", 0};
15004 char *encoding = NULL;
15005 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015006
Benjamin Peterson14339b62009-01-31 16:36:08 +000015007 if (type != &PyUnicode_Type)
15008 return unicode_subtype_new(type, args, kwds);
15009 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015010 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015011 return NULL;
15012 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015013 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 if (encoding == NULL && errors == NULL)
15015 return PyObject_Str(x);
15016 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015017 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015018}
15019
Guido van Rossume023fe02001-08-30 03:12:59 +000015020static PyObject *
15021unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15022{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015023 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015024 Py_ssize_t length, char_size;
15025 int share_wstr, share_utf8;
15026 unsigned int kind;
15027 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015028
Benjamin Peterson14339b62009-01-31 16:36:08 +000015029 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015030
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015031 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015032 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015033 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015034 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015035 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015036 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015037 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015038 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015039
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015040 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015041 if (self == NULL) {
15042 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015043 return NULL;
15044 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015045 kind = PyUnicode_KIND(unicode);
15046 length = PyUnicode_GET_LENGTH(unicode);
15047
15048 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015049#ifdef Py_DEBUG
15050 _PyUnicode_HASH(self) = -1;
15051#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015052 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015053#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015054 _PyUnicode_STATE(self).interned = 0;
15055 _PyUnicode_STATE(self).kind = kind;
15056 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015057 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 _PyUnicode_STATE(self).ready = 1;
15059 _PyUnicode_WSTR(self) = NULL;
15060 _PyUnicode_UTF8_LENGTH(self) = 0;
15061 _PyUnicode_UTF8(self) = NULL;
15062 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015063 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015064
15065 share_utf8 = 0;
15066 share_wstr = 0;
15067 if (kind == PyUnicode_1BYTE_KIND) {
15068 char_size = 1;
15069 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15070 share_utf8 = 1;
15071 }
15072 else if (kind == PyUnicode_2BYTE_KIND) {
15073 char_size = 2;
15074 if (sizeof(wchar_t) == 2)
15075 share_wstr = 1;
15076 }
15077 else {
15078 assert(kind == PyUnicode_4BYTE_KIND);
15079 char_size = 4;
15080 if (sizeof(wchar_t) == 4)
15081 share_wstr = 1;
15082 }
15083
15084 /* Ensure we won't overflow the length. */
15085 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15086 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015087 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015088 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015089 data = PyObject_MALLOC((length + 1) * char_size);
15090 if (data == NULL) {
15091 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015092 goto onError;
15093 }
15094
Victor Stinnerc3c74152011-10-02 20:39:55 +020015095 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015096 if (share_utf8) {
15097 _PyUnicode_UTF8_LENGTH(self) = length;
15098 _PyUnicode_UTF8(self) = data;
15099 }
15100 if (share_wstr) {
15101 _PyUnicode_WSTR_LENGTH(self) = length;
15102 _PyUnicode_WSTR(self) = (wchar_t *)data;
15103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015104
Christian Heimesf051e432016-09-13 20:22:02 +020015105 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015106 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015107 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015108#ifdef Py_DEBUG
15109 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15110#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015111 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015112 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015113
15114onError:
15115 Py_DECREF(unicode);
15116 Py_DECREF(self);
15117 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015118}
15119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015120PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015121"str(object='') -> str\n\
15122str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015123\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015124Create a new string object from the given object. If encoding or\n\
15125errors is specified, then the object must expose a data buffer\n\
15126that will be decoded using the given encoding and error handler.\n\
15127Otherwise, returns the result of object.__str__() (if defined)\n\
15128or repr(object).\n\
15129encoding defaults to sys.getdefaultencoding().\n\
15130errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015131
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015132static PyObject *unicode_iter(PyObject *seq);
15133
Guido van Rossumd57fd912000-03-10 22:53:23 +000015134PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015135 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 "str", /* tp_name */
15137 sizeof(PyUnicodeObject), /* tp_size */
15138 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015139 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 (destructor)unicode_dealloc, /* tp_dealloc */
15141 0, /* tp_print */
15142 0, /* tp_getattr */
15143 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015144 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015145 unicode_repr, /* tp_repr */
15146 &unicode_as_number, /* tp_as_number */
15147 &unicode_as_sequence, /* tp_as_sequence */
15148 &unicode_as_mapping, /* tp_as_mapping */
15149 (hashfunc) unicode_hash, /* tp_hash*/
15150 0, /* tp_call*/
15151 (reprfunc) unicode_str, /* tp_str */
15152 PyObject_GenericGetAttr, /* tp_getattro */
15153 0, /* tp_setattro */
15154 0, /* tp_as_buffer */
15155 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015156 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 unicode_doc, /* tp_doc */
15158 0, /* tp_traverse */
15159 0, /* tp_clear */
15160 PyUnicode_RichCompare, /* tp_richcompare */
15161 0, /* tp_weaklistoffset */
15162 unicode_iter, /* tp_iter */
15163 0, /* tp_iternext */
15164 unicode_methods, /* tp_methods */
15165 0, /* tp_members */
15166 0, /* tp_getset */
15167 &PyBaseObject_Type, /* tp_base */
15168 0, /* tp_dict */
15169 0, /* tp_descr_get */
15170 0, /* tp_descr_set */
15171 0, /* tp_dictoffset */
15172 0, /* tp_init */
15173 0, /* tp_alloc */
15174 unicode_new, /* tp_new */
15175 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015176};
15177
15178/* Initialize the Unicode implementation */
15179
Victor Stinner3a50e702011-10-18 21:21:00 +020015180int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015181{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015182 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015183 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015184 0x000A, /* LINE FEED */
15185 0x000D, /* CARRIAGE RETURN */
15186 0x001C, /* FILE SEPARATOR */
15187 0x001D, /* GROUP SEPARATOR */
15188 0x001E, /* RECORD SEPARATOR */
15189 0x0085, /* NEXT LINE */
15190 0x2028, /* LINE SEPARATOR */
15191 0x2029, /* PARAGRAPH SEPARATOR */
15192 };
15193
Fred Drakee4315f52000-05-09 19:53:39 +000015194 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015195 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015196 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015197 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015198 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015199
Guido van Rossumcacfc072002-05-24 19:01:59 +000015200 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015201 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015202
15203 /* initialize the linebreak bloom filter */
15204 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015205 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015206 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015207
Christian Heimes26532f72013-07-20 14:57:16 +020015208 if (PyType_Ready(&EncodingMapType) < 0)
15209 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015210
Benjamin Petersonc4311282012-10-30 23:21:10 -040015211 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15212 Py_FatalError("Can't initialize field name iterator type");
15213
15214 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15215 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015216
Victor Stinner3a50e702011-10-18 21:21:00 +020015217 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015218}
15219
15220/* Finalize the Unicode implementation */
15221
/* Does no work in this implementation; always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15227
Guido van Rossumd57fd912000-03-10 22:53:23 +000015228void
Thomas Wouters78890102000-07-22 19:25:51 +000015229_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015230{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015231 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015232
Serhiy Storchaka05997252013-01-26 12:14:02 +020015233 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015234
Serhiy Storchaka05997252013-01-26 12:14:02 +020015235 for (i = 0; i < 256; i++)
15236 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015237 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015238 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015239}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015240
Walter Dörwald16807132007-05-25 13:52:07 +000015241void
15242PyUnicode_InternInPlace(PyObject **p)
15243{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015244 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015246#ifdef Py_DEBUG
15247 assert(s != NULL);
15248 assert(_PyUnicode_CHECK(s));
15249#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015250 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015251 return;
15252#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 /* If it's a subclass, we don't really know what putting
15254 it in the interned dict might do. */
15255 if (!PyUnicode_CheckExact(s))
15256 return;
15257 if (PyUnicode_CHECK_INTERNED(s))
15258 return;
15259 if (interned == NULL) {
15260 interned = PyDict_New();
15261 if (interned == NULL) {
15262 PyErr_Clear(); /* Don't leave an exception */
15263 return;
15264 }
15265 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015266 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015267 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015269 if (t == NULL) {
15270 PyErr_Clear();
15271 return;
15272 }
15273 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015274 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015275 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015276 return;
15277 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 /* The two references in interned are not counted by refcnt.
15279 The deallocator will take care of this */
15280 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015281 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015282}
15283
15284void
15285PyUnicode_InternImmortal(PyObject **p)
15286{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015287 PyUnicode_InternInPlace(p);
15288 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015289 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 Py_INCREF(*p);
15291 }
Walter Dörwald16807132007-05-25 13:52:07 +000015292}
15293
15294PyObject *
15295PyUnicode_InternFromString(const char *cp)
15296{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015297 PyObject *s = PyUnicode_FromString(cp);
15298 if (s == NULL)
15299 return NULL;
15300 PyUnicode_InternInPlace(&s);
15301 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015302}
15303
Alexander Belopolsky40018472011-02-26 01:02:56 +000015304void
15305_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015306{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015308 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 Py_ssize_t i, n;
15310 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015311
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 if (interned == NULL || !PyDict_Check(interned))
15313 return;
15314 keys = PyDict_Keys(interned);
15315 if (keys == NULL || !PyList_Check(keys)) {
15316 PyErr_Clear();
15317 return;
15318 }
Walter Dörwald16807132007-05-25 13:52:07 +000015319
Benjamin Peterson14339b62009-01-31 16:36:08 +000015320 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15321 detector, interned unicode strings are not forcibly deallocated;
15322 rather, we give them their stolen references back, and then clear
15323 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015324
Benjamin Peterson14339b62009-01-31 16:36:08 +000015325 n = PyList_GET_SIZE(keys);
15326 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015327 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015329 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015330 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015331 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015333 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015334 case SSTATE_NOT_INTERNED:
15335 /* XXX Shouldn't happen */
15336 break;
15337 case SSTATE_INTERNED_IMMORTAL:
15338 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015339 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 break;
15341 case SSTATE_INTERNED_MORTAL:
15342 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015343 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 break;
15345 default:
15346 Py_FatalError("Inconsistent interned string state.");
15347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015348 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 }
15350 fprintf(stderr, "total size of all interned strings: "
15351 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15352 "mortal/immortal\n", mortal_size, immortal_size);
15353 Py_DECREF(keys);
15354 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015355 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015356}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015357
15358
15359/********************* Unicode Iterator **************************/
15360
15361typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 PyObject_HEAD
15363 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015364 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015365} unicodeiterobject;
15366
15367static void
15368unicodeiter_dealloc(unicodeiterobject *it)
15369{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 _PyObject_GC_UNTRACK(it);
15371 Py_XDECREF(it->it_seq);
15372 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015373}
15374
15375static int
15376unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15377{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 Py_VISIT(it->it_seq);
15379 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015380}
15381
15382static PyObject *
15383unicodeiter_next(unicodeiterobject *it)
15384{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015385 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015386
Benjamin Peterson14339b62009-01-31 16:36:08 +000015387 assert(it != NULL);
15388 seq = it->it_seq;
15389 if (seq == NULL)
15390 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015391 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015393 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15394 int kind = PyUnicode_KIND(seq);
15395 void *data = PyUnicode_DATA(seq);
15396 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15397 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015398 if (item != NULL)
15399 ++it->it_index;
15400 return item;
15401 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015402
Benjamin Peterson14339b62009-01-31 16:36:08 +000015403 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015404 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015405 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015406}
15407
15408static PyObject *
15409unicodeiter_len(unicodeiterobject *it)
15410{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015411 Py_ssize_t len = 0;
15412 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015413 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015414 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015415}
15416
15417PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15418
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015419static PyObject *
15420unicodeiter_reduce(unicodeiterobject *it)
15421{
15422 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015423 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015424 it->it_seq, it->it_index);
15425 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015426 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015427 if (u == NULL)
15428 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015429 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015430 }
15431}
15432
15433PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15434
15435static PyObject *
15436unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15437{
15438 Py_ssize_t index = PyLong_AsSsize_t(state);
15439 if (index == -1 && PyErr_Occurred())
15440 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015441 if (it->it_seq != NULL) {
15442 if (index < 0)
15443 index = 0;
15444 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15445 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15446 it->it_index = index;
15447 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015448 Py_RETURN_NONE;
15449}
15450
15451PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15452
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015453static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015454 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015455 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015456 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15457 reduce_doc},
15458 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15459 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015460 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015461};
15462
15463PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015464 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15465 "str_iterator", /* tp_name */
15466 sizeof(unicodeiterobject), /* tp_basicsize */
15467 0, /* tp_itemsize */
15468 /* methods */
15469 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15470 0, /* tp_print */
15471 0, /* tp_getattr */
15472 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015473 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015474 0, /* tp_repr */
15475 0, /* tp_as_number */
15476 0, /* tp_as_sequence */
15477 0, /* tp_as_mapping */
15478 0, /* tp_hash */
15479 0, /* tp_call */
15480 0, /* tp_str */
15481 PyObject_GenericGetAttr, /* tp_getattro */
15482 0, /* tp_setattro */
15483 0, /* tp_as_buffer */
15484 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15485 0, /* tp_doc */
15486 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15487 0, /* tp_clear */
15488 0, /* tp_richcompare */
15489 0, /* tp_weaklistoffset */
15490 PyObject_SelfIter, /* tp_iter */
15491 (iternextfunc)unicodeiter_next, /* tp_iternext */
15492 unicodeiter_methods, /* tp_methods */
15493 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015494};
15495
15496static PyObject *
15497unicode_iter(PyObject *seq)
15498{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015499 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015500
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 if (!PyUnicode_Check(seq)) {
15502 PyErr_BadInternalCall();
15503 return NULL;
15504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015505 if (PyUnicode_READY(seq) == -1)
15506 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015507 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15508 if (it == NULL)
15509 return NULL;
15510 it->it_index = 0;
15511 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015512 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015513 _PyObject_GC_TRACK(it);
15514 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015515}
15516
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015517
15518size_t
15519Py_UNICODE_strlen(const Py_UNICODE *u)
15520{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015521 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015522}
15523
15524Py_UNICODE*
15525Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15526{
15527 Py_UNICODE *u = s1;
15528 while ((*u++ = *s2++));
15529 return s1;
15530}
15531
15532Py_UNICODE*
15533Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15534{
15535 Py_UNICODE *u = s1;
15536 while ((*u++ = *s2++))
15537 if (n-- == 0)
15538 break;
15539 return s1;
15540}
15541
15542Py_UNICODE*
15543Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15544{
15545 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015546 u1 += wcslen(u1);
15547 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015548 return s1;
15549}
15550
15551int
15552Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15553{
15554 while (*s1 && *s2 && *s1 == *s2)
15555 s1++, s2++;
15556 if (*s1 && *s2)
15557 return (*s1 < *s2) ? -1 : +1;
15558 if (*s1)
15559 return 1;
15560 if (*s2)
15561 return -1;
15562 return 0;
15563}
15564
15565int
15566Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15567{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015568 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015569 for (; n != 0; n--) {
15570 u1 = *s1;
15571 u2 = *s2;
15572 if (u1 != u2)
15573 return (u1 < u2) ? -1 : +1;
15574 if (u1 == '\0')
15575 return 0;
15576 s1++;
15577 s2++;
15578 }
15579 return 0;
15580}
15581
15582Py_UNICODE*
15583Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15584{
15585 const Py_UNICODE *p;
15586 for (p = s; *p; p++)
15587 if (*p == c)
15588 return (Py_UNICODE*)p;
15589 return NULL;
15590}
15591
15592Py_UNICODE*
15593Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15594{
15595 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015596 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015597 while (p != s) {
15598 p--;
15599 if (*p == c)
15600 return (Py_UNICODE*)p;
15601 }
15602 return NULL;
15603}
Victor Stinner331ea922010-08-10 16:37:20 +000015604
Victor Stinner71133ff2010-09-01 23:43:53 +000015605Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015606PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015607{
Victor Stinner577db2c2011-10-11 22:12:48 +020015608 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015609 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015611 if (!PyUnicode_Check(unicode)) {
15612 PyErr_BadArgument();
15613 return NULL;
15614 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015615 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015616 if (u == NULL)
15617 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015618 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015619 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015620 PyErr_NoMemory();
15621 return NULL;
15622 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015623 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015624 size *= sizeof(Py_UNICODE);
15625 copy = PyMem_Malloc(size);
15626 if (copy == NULL) {
15627 PyErr_NoMemory();
15628 return NULL;
15629 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015630 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015631 return copy;
15632}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015633
Georg Brandl66c221e2010-10-14 07:04:07 +000015634/* A _string module, to export formatter_parser and formatter_field_name_split
15635 to the string.Formatter class implemented in Python. */
15636
15637static PyMethodDef _string_methods[] = {
15638 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15639 METH_O, PyDoc_STR("split the argument as a field name")},
15640 {"formatter_parser", (PyCFunction) formatter_parser,
15641 METH_O, PyDoc_STR("parse the argument as a format string")},
15642 {NULL, NULL}
15643};
15644
15645static struct PyModuleDef _string_module = {
15646 PyModuleDef_HEAD_INIT,
15647 "_string",
15648 PyDoc_STR("string helper module"),
15649 0,
15650 _string_methods,
15651 NULL,
15652 NULL,
15653 NULL,
15654 NULL
15655};
15656
15657PyMODINIT_FUNC
15658PyInit__string(void)
15659{
15660 return PyModule_Create(&_string_module);
15661}
15662
15663
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015664#ifdef __cplusplus
15665}
15666#endif