blob: e6371d2337c3dac2f4b98ce6f245752362a66cc9 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process, Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Highest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros in this file.  Debug builds run
   _PyUnicode_CheckConsistency() with content checking disabled; all other
   builds only run PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Field accessors for the Unicode object layouts.

   The underscore-prefixed forms are plain casts plus a member access (some
   also assert _PyUnicode_CHECK).  The PyUnicode_UTF8* forms additionally
   assert that the object is ready and special-case compact ASCII objects,
   whose character data starts right after the PyASCIIObject header. */

/* UTF-8 buffer pointer stored in the compact header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer: the inline data for compact ASCII, the utf8 field otherwise. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length stored in the compact header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: the object length for compact ASCII, utf8_length otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of the PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   evaluates to 0 when the object is already ready and otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
          0 :                                           \
          _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the UTF-8 pointer of 'op' is the same address as its character
   data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of 'op' is the same address as its character
   data.  The comparison uses the macro argument on both sides, so the
   expansion does not depend on a variable named 'unicode' being in scope
   at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* True if the Unicode object owns a separately allocated UTF-8 block, i.e.
   it is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block, i.e.
   its wstr pointer is set and either the object is not ready or the pointer
   is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156
/* Generic helper that copies a run of characters while converting each one
   from 'from_type' to 'to_type' with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         pointers delimiting the source characters
                       ("from_type *"); 'end' is exclusive.
   to:                 destination buffer ("to_type *").

   The bulk of the input is handled four elements per iteration; the
   remaining 0-3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_out = (to_type *)(to);                                \
        const from_type *_in = (from_type *)(begin);                    \
        const from_type *_stop = (from_type *)(end);                    \
        Py_ssize_t _count = (_stop) - (_in);                            \
        const from_type *_bulk_stop =                                   \
            _in + _Py_SIZE_ROUND_DOWN(_count, 4);                       \
        for (; _in < (_bulk_stop); _in += 4, _out += 4) {               \
            _out[0] = (to_type) _in[0];                                 \
            _out[1] = (to_type) _in[1];                                 \
            _out[2] = (to_type) _in[2];                                 \
            _out[3] = (to_type) _in[3];                                 \
        }                                                               \
        for (; _in < (_stop); ++_in, ++_out) {                          \
            *_out = (to_type) *_in;                                     \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
/* Return the shared empty string from the enclosing function after taking
   a reference via _Py_INCREF_UNICODE_EMPTY().  The returned value is
   unicode_empty itself, which is NULL if it could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
/* Store 'value' into 'length' consecutive characters of the buffer 'data',
   starting at character index 'start'.

   kind:   one of PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND (PyUnicode_WCHAR_KIND is rejected by assert).
   data:   pointer to the character buffer matching 'kind'.
   value:  character to store; must fit the kind (asserted).
   start:  non-negative character index (asserted).
   length: number of characters to write.

   Every argument is evaluated exactly once into a _fill_-prefixed local, so
   arguments may be arbitrary expressions and may use names such as 'ch',
   'to' or 'end' without colliding with the macro's own variables. */
#define FILL(kind, data, value, start, length)                              \
    do {                                                                    \
        const int _fill_kind = (int)(kind);                                 \
        void *_fill_data = (void *)(data);                                  \
        const Py_UCS4 _fill_value = (Py_UCS4)(value);                       \
        const Py_ssize_t _fill_start = (start);                             \
        const Py_ssize_t _fill_length = (length);                           \
        assert(0 <= _fill_start);                                           \
        assert(_fill_kind != PyUnicode_WCHAR_KIND);                         \
        switch (_fill_kind) {                                               \
        case PyUnicode_1BYTE_KIND: {                                        \
            Py_UCS1 *_fill_to = (Py_UCS1 *)_fill_data + _fill_start;        \
            assert(_fill_value <= 0xff);                                    \
            memset(_fill_to, (unsigned char)_fill_value, _fill_length);     \
            break;                                                          \
        }                                                                   \
        case PyUnicode_2BYTE_KIND: {                                        \
            Py_UCS2 *_fill_to = (Py_UCS2 *)_fill_data + _fill_start;        \
            const Py_UCS2 *_fill_end = _fill_to + _fill_length;             \
            const Py_UCS2 _fill_ch = (Py_UCS2)_fill_value;                  \
            assert(_fill_value <= 0xffff);                                  \
            for (; _fill_to < _fill_end; ++_fill_to) *_fill_to = _fill_ch;  \
            break;                                                          \
        }                                                                   \
        case PyUnicode_4BYTE_KIND: {                                        \
            Py_UCS4 *_fill_to = (Py_UCS4 *)_fill_data + _fill_start;        \
            const Py_UCS4 *_fill_end = _fill_to + _fill_length;             \
            assert(_fill_value <= MAX_UNICODE);                             \
            for (; _fill_to < _fill_end; ++_fill_to)                        \
                *_fill_to = _fill_value;                                    \
            break;                                                          \
        }                                                                   \
        default: Py_UNREACHABLE();                                          \
        }                                                                   \
    } while (0)
252
253
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200254/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700255static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200256_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
257
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200258/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200259static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200260
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261/* Single character Unicode strings in the Latin-1 range are being
262 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200263static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000264
/* Fast detection of the most frequent whitespace characters.
   Indexed by a 7-bit code point (0..127); an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: CHARACTER TABULATION
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
295
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200296/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200297static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200298static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100299static int unicode_modifiable(PyObject *unicode);
300
Victor Stinnerfe226c02011-10-03 03:52:20 +0200301
Alexander Belopolsky40018472011-02-26 01:02:56 +0000302static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100303_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200304static PyObject *
305_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
306static PyObject *
307_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
308
309static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000311 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100312 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000313 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
314
Alexander Belopolsky40018472011-02-26 01:02:56 +0000315static void
316raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300317 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100318 PyObject *unicode,
319 Py_ssize_t startpos, Py_ssize_t endpos,
320 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000321
/* Fast detection of line break characters.
   Indexed by a 7-bit code point (0..127); an entry is 1 for a line break. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
349
INADA Naoki3ae20562017-01-16 20:41:20 +0900350static int convert_uc(PyObject *obj, void *addr);
351
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300352#include "clinic/unicodeobject.c.h"
353
/* Codec error handlers recognized by name.  _Py_ERROR_OTHER stands for any
   name that is not in the list handled by get_error_handler(). */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other known names (exact, case-sensitive comparison), and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"replace", _Py_ERROR_REPLACE},
        {"ignore", _Py_ERROR_IGNORE},
        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]);
         idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
392
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300393/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
394 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000395Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000396PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000397{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000398#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000399 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000400#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000401 /* This is actually an illegal character, so it should
402 not be passed to unichr. */
403 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000404#endif
405}
406
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's internal state.

   op:            object to inspect; must pass PyUnicode_Check().
   check_content: when non-zero (and the kind is not PyUnicode_WCHAR_KIND),
                  also scan the characters to verify that the narrowest
                  suitable kind is in use and that the character following
                  the last one is 0.

   Every violation trips an assert(); the function itself always returns 1
   so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* Compact ASCII object. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* Kind whose character width equals sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: characters follow the compact header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr representation exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact object. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind matches the
               width of wchar_t. */
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* An absent buffer must come with a zero length. */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (pos = 0; pos < ascii->length; pos++) {
            ch = PyUnicode_READ(kind, data, pos);
            if (ch > highest) {
                highest = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* The character after the last one must be 0. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
522
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100523static PyObject*
524unicode_result_wchar(PyObject *unicode)
525{
526#ifndef Py_DEBUG
527 Py_ssize_t len;
528
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100529 len = _PyUnicode_WSTR_LENGTH(unicode);
530 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100531 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200532 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100533 }
534
535 if (len == 1) {
536 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100537 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100538 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
539 Py_DECREF(unicode);
540 return latin1_char;
541 }
542 }
543
544 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200545 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100546 return NULL;
547 }
548#else
Victor Stinneraa771272012-10-04 02:32:58 +0200549 assert(Py_REFCNT(unicode) == 1);
550
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100551 /* don't make the result ready in debug mode to ensure that the caller
552 makes the string ready before using it */
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554#endif
555 return unicode;
556}
557
558static PyObject*
559unicode_result_ready(PyObject *unicode)
560{
561 Py_ssize_t length;
562
563 length = PyUnicode_GET_LENGTH(unicode);
564 if (length == 0) {
565 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100566 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200567 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100568 }
569 return unicode_empty;
570 }
571
572 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200573 void *data = PyUnicode_DATA(unicode);
574 int kind = PyUnicode_KIND(unicode);
575 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100576 if (ch < 256) {
577 PyObject *latin1_char = unicode_latin1[ch];
578 if (latin1_char != NULL) {
579 if (unicode != latin1_char) {
580 Py_INCREF(latin1_char);
581 Py_DECREF(unicode);
582 }
583 return latin1_char;
584 }
585 else {
586 assert(_PyUnicode_CheckConsistency(unicode, 1));
587 Py_INCREF(unicode);
588 unicode_latin1[ch] = unicode;
589 return unicode;
590 }
591 }
592 }
593
594 assert(_PyUnicode_CheckConsistency(unicode, 1));
595 return unicode;
596}
597
598static PyObject*
599unicode_result(PyObject *unicode)
600{
601 assert(_PyUnicode_CHECK(unicode));
602 if (PyUnicode_IS_READY(unicode))
603 return unicode_result_ready(unicode);
604 else
605 return unicode_result_wchar(unicode);
606}
607
Victor Stinnerc4b49542011-12-11 22:44:26 +0100608static PyObject*
609unicode_result_unchanged(PyObject *unicode)
610{
611 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500612 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100613 return NULL;
614 Py_INCREF(unicode);
615 return unicode;
616 }
617 else
618 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100619 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100620}
621
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
623 ASCII, Latin1, UTF-8, etc. */
624static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200625backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200626 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
627{
Victor Stinnerad771582015-10-09 12:38:53 +0200628 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200629 Py_UCS4 ch;
630 enum PyUnicode_Kind kind;
631 void *data;
632
633 assert(PyUnicode_IS_READY(unicode));
634 kind = PyUnicode_KIND(unicode);
635 data = PyUnicode_DATA(unicode);
636
637 size = 0;
638 /* determine replacement size */
639 for (i = collstart; i < collend; ++i) {
640 Py_ssize_t incr;
641
642 ch = PyUnicode_READ(kind, data, i);
643 if (ch < 0x100)
644 incr = 2+2;
645 else if (ch < 0x10000)
646 incr = 2+4;
647 else {
648 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200649 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200650 }
651 if (size > PY_SSIZE_T_MAX - incr) {
652 PyErr_SetString(PyExc_OverflowError,
653 "encoded result is too long for a Python string");
654 return NULL;
655 }
656 size += incr;
657 }
658
Victor Stinnerad771582015-10-09 12:38:53 +0200659 str = _PyBytesWriter_Prepare(writer, str, size);
660 if (str == NULL)
661 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662
663 /* generate replacement */
664 for (i = collstart; i < collend; ++i) {
665 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200666 *str++ = '\\';
667 if (ch >= 0x00010000) {
668 *str++ = 'U';
669 *str++ = Py_hexdigits[(ch>>28)&0xf];
670 *str++ = Py_hexdigits[(ch>>24)&0xf];
671 *str++ = Py_hexdigits[(ch>>20)&0xf];
672 *str++ = Py_hexdigits[(ch>>16)&0xf];
673 *str++ = Py_hexdigits[(ch>>12)&0xf];
674 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200675 }
Victor Stinner797485e2015-10-09 03:17:30 +0200676 else if (ch >= 0x100) {
677 *str++ = 'u';
678 *str++ = Py_hexdigits[(ch>>12)&0xf];
679 *str++ = Py_hexdigits[(ch>>8)&0xf];
680 }
681 else
682 *str++ = 'x';
683 *str++ = Py_hexdigits[(ch>>4)&0xf];
684 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200685 }
686 return str;
687}
688
689/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
690 ASCII, Latin1, UTF-8, etc. */
691static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200692xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200693 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
694{
Victor Stinnerad771582015-10-09 12:38:53 +0200695 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696 Py_UCS4 ch;
697 enum PyUnicode_Kind kind;
698 void *data;
699
700 assert(PyUnicode_IS_READY(unicode));
701 kind = PyUnicode_KIND(unicode);
702 data = PyUnicode_DATA(unicode);
703
704 size = 0;
705 /* determine replacement size */
706 for (i = collstart; i < collend; ++i) {
707 Py_ssize_t incr;
708
709 ch = PyUnicode_READ(kind, data, i);
710 if (ch < 10)
711 incr = 2+1+1;
712 else if (ch < 100)
713 incr = 2+2+1;
714 else if (ch < 1000)
715 incr = 2+3+1;
716 else if (ch < 10000)
717 incr = 2+4+1;
718 else if (ch < 100000)
719 incr = 2+5+1;
720 else if (ch < 1000000)
721 incr = 2+6+1;
722 else {
723 assert(ch <= MAX_UNICODE);
724 incr = 2+7+1;
725 }
726 if (size > PY_SSIZE_T_MAX - incr) {
727 PyErr_SetString(PyExc_OverflowError,
728 "encoded result is too long for a Python string");
729 return NULL;
730 }
731 size += incr;
732 }
733
Victor Stinnerad771582015-10-09 12:38:53 +0200734 str = _PyBytesWriter_Prepare(writer, str, size);
735 if (str == NULL)
736 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200737
738 /* generate replacement */
739 for (i = collstart; i < collend; ++i) {
740 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
741 }
742 return str;
743}
744
Thomas Wouters477c8d52006-05-27 19:21:47 +0000745/* --- Bloom Filters ----------------------------------------------------- */
746
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
750
751/* the linebreak mask is set up by Unicode_Init below */
752
Antoine Pitrouf068f942010-01-13 14:19:12 +0000753#if LONG_BIT >= 128
754#define BLOOM_WIDTH 128
755#elif LONG_BIT >= 64
756#define BLOOM_WIDTH 64
757#elif LONG_BIT >= 32
758#define BLOOM_WIDTH 32
759#else
760#error "LONG_BIT is smaller than 32"
761#endif
762
Thomas Wouters477c8d52006-05-27 19:21:47 +0000763#define BLOOM_MASK unsigned long
764
Serhiy Storchaka05997252013-01-26 12:14:02 +0200765static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000766
Antoine Pitrouf068f942010-01-13 14:19:12 +0000767#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000768
Benjamin Peterson29060642009-01-31 22:14:21 +0000769#define BLOOM_LINEBREAK(ch) \
770 ((ch) < 128U ? ascii_linebreak[(ch)] : \
771 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000772
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700773static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200774make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000775{
Victor Stinnera85af502013-04-09 21:53:54 +0200776#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
777 do { \
778 TYPE *data = (TYPE *)PTR; \
779 TYPE *end = data + LEN; \
780 Py_UCS4 ch; \
781 for (; data != end; data++) { \
782 ch = *data; \
783 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
784 } \
785 break; \
786 } while (0)
787
Thomas Wouters477c8d52006-05-27 19:21:47 +0000788 /* calculate simple bloom-style bitmask for a given unicode string */
789
Antoine Pitrouf068f942010-01-13 14:19:12 +0000790 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000791
792 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200793 switch (kind) {
794 case PyUnicode_1BYTE_KIND:
795 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
796 break;
797 case PyUnicode_2BYTE_KIND:
798 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
799 break;
800 case PyUnicode_4BYTE_KIND:
801 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
802 break;
803 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700804 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200805 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200807
808#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809}
810
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300811static int
812ensure_unicode(PyObject *obj)
813{
814 if (!PyUnicode_Check(obj)) {
815 PyErr_Format(PyExc_TypeError,
816 "must be str, not %.100s",
817 Py_TYPE(obj)->tp_name);
818 return -1;
819 }
820 return PyUnicode_READY(obj);
821}
822
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200823/* Compilation of templated routines */
824
825#include "stringlib/asciilib.h"
826#include "stringlib/fastsearch.h"
827#include "stringlib/partition.h"
828#include "stringlib/split.h"
829#include "stringlib/count.h"
830#include "stringlib/find.h"
831#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200832#include "stringlib/undef.h"
833
834#include "stringlib/ucs1lib.h"
835#include "stringlib/fastsearch.h"
836#include "stringlib/partition.h"
837#include "stringlib/split.h"
838#include "stringlib/count.h"
839#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300840#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200841#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200842#include "stringlib/undef.h"
843
844#include "stringlib/ucs2lib.h"
845#include "stringlib/fastsearch.h"
846#include "stringlib/partition.h"
847#include "stringlib/split.h"
848#include "stringlib/count.h"
849#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300850#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200851#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200852#include "stringlib/undef.h"
853
854#include "stringlib/ucs4lib.h"
855#include "stringlib/fastsearch.h"
856#include "stringlib/partition.h"
857#include "stringlib/split.h"
858#include "stringlib/count.h"
859#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300860#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200861#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200862#include "stringlib/undef.h"
863
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200864#include "stringlib/unicodedefs.h"
865#include "stringlib/fastsearch.h"
866#include "stringlib/count.h"
867#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100868#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200869
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870/* --- Unicode Object ----------------------------------------------------- */
871
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700872static inline Py_ssize_t
873findchar(const void *s, int kind,
874 Py_ssize_t size, Py_UCS4 ch,
875 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200876{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200877 switch (kind) {
878 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200879 if ((Py_UCS1) ch != ch)
880 return -1;
881 if (direction > 0)
882 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
883 else
884 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200885 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200886 if ((Py_UCS2) ch != ch)
887 return -1;
888 if (direction > 0)
889 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
890 else
891 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200892 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200893 if (direction > 0)
894 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
895 else
896 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200897 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700898 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900}
901
Victor Stinnerafffce42012-10-03 23:03:17 +0200902#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000903/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200904 earlier.
905
906 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
907 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
908 invalid character in Unicode 6.0. */
909static void
910unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
911{
912 int kind = PyUnicode_KIND(unicode);
913 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
914 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
915 if (length <= old_length)
916 return;
917 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
918}
919#endif
920
Victor Stinnerfe226c02011-10-03 03:52:20 +0200921static PyObject*
922resize_compact(PyObject *unicode, Py_ssize_t length)
923{
924 Py_ssize_t char_size;
925 Py_ssize_t struct_size;
926 Py_ssize_t new_size;
927 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100928 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200929#ifdef Py_DEBUG
930 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
931#endif
932
Victor Stinner79891572012-05-03 13:43:07 +0200933 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200934 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100935 assert(PyUnicode_IS_COMPACT(unicode));
936
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200937 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100938 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 struct_size = sizeof(PyASCIIObject);
940 else
941 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943
Victor Stinnerfe226c02011-10-03 03:52:20 +0200944 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
945 PyErr_NoMemory();
946 return NULL;
947 }
948 new_size = (struct_size + (length + 1) * char_size);
949
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200950 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
951 PyObject_DEL(_PyUnicode_UTF8(unicode));
952 _PyUnicode_UTF8(unicode) = NULL;
953 _PyUnicode_UTF8_LENGTH(unicode) = 0;
954 }
Victor Stinner84def372011-12-11 20:04:56 +0100955 _Py_DEC_REFTOTAL;
956 _Py_ForgetReference(unicode);
957
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300958 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100959 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100960 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 PyErr_NoMemory();
962 return NULL;
963 }
Victor Stinner84def372011-12-11 20:04:56 +0100964 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100966
Victor Stinnerfe226c02011-10-03 03:52:20 +0200967 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200968 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200969 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100970 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200971 _PyUnicode_WSTR_LENGTH(unicode) = length;
972 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100973 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
974 PyObject_DEL(_PyUnicode_WSTR(unicode));
975 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100976 if (!PyUnicode_IS_ASCII(unicode))
977 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100978 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200979#ifdef Py_DEBUG
980 unicode_fill_invalid(unicode, old_length);
981#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
983 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200984 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200985 return unicode;
986}
987
Alexander Belopolsky40018472011-02-26 01:02:56 +0000988static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200989resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990{
Victor Stinner95663112011-10-04 01:03:50 +0200991 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100992 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200994 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 if (PyUnicode_IS_READY(unicode)) {
997 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200998 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001000#ifdef Py_DEBUG
1001 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1002#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003
1004 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001005 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001006 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1007 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008
1009 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1010 PyErr_NoMemory();
1011 return -1;
1012 }
1013 new_size = (length + 1) * char_size;
1014
Victor Stinner7a9105a2011-12-12 00:13:42 +01001015 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1016 {
1017 PyObject_DEL(_PyUnicode_UTF8(unicode));
1018 _PyUnicode_UTF8(unicode) = NULL;
1019 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1020 }
1021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 data = (PyObject *)PyObject_REALLOC(data, new_size);
1023 if (data == NULL) {
1024 PyErr_NoMemory();
1025 return -1;
1026 }
1027 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001028 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001030 _PyUnicode_WSTR_LENGTH(unicode) = length;
1031 }
1032 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001033 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001034 _PyUnicode_UTF8_LENGTH(unicode) = length;
1035 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 _PyUnicode_LENGTH(unicode) = length;
1037 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 unicode_fill_invalid(unicode, old_length);
1040#endif
Victor Stinner95663112011-10-04 01:03:50 +02001041 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001042 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 }
Victor Stinner95663112011-10-04 01:03:50 +02001046 assert(_PyUnicode_WSTR(unicode) != NULL);
1047
1048 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001049 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001050 PyErr_NoMemory();
1051 return -1;
1052 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001053 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001054 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001055 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001056 if (!wstr) {
1057 PyErr_NoMemory();
1058 return -1;
1059 }
1060 _PyUnicode_WSTR(unicode) = wstr;
1061 _PyUnicode_WSTR(unicode)[length] = 0;
1062 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001063 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 return 0;
1065}
1066
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067static PyObject*
1068resize_copy(PyObject *unicode, Py_ssize_t length)
1069{
1070 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001071 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001073
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001074 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001075
1076 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1077 if (copy == NULL)
1078 return NULL;
1079
1080 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001083 }
1084 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001085 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001086
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001087 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 if (w == NULL)
1089 return NULL;
1090 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1091 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001092 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001093 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001094 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 }
1096}
1097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001099 Ux0000 terminated; some code (e.g. new_identifier)
1100 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001103 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104
1105*/
1106
Alexander Belopolsky40018472011-02-26 01:02:56 +00001107static PyUnicodeObject *
1108_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001110 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112
Thomas Wouters477c8d52006-05-27 19:21:47 +00001113 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 if (length == 0 && unicode_empty != NULL) {
1115 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001116 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 }
1118
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001119 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001120 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001121 return (PyUnicodeObject *)PyErr_NoMemory();
1122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 if (length < 0) {
1124 PyErr_SetString(PyExc_SystemError,
1125 "Negative size passed to _PyUnicode_New");
1126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 }
1128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1130 if (unicode == NULL)
1131 return NULL;
1132 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001133
1134 _PyUnicode_WSTR_LENGTH(unicode) = length;
1135 _PyUnicode_HASH(unicode) = -1;
1136 _PyUnicode_STATE(unicode).interned = 0;
1137 _PyUnicode_STATE(unicode).kind = 0;
1138 _PyUnicode_STATE(unicode).compact = 0;
1139 _PyUnicode_STATE(unicode).ready = 0;
1140 _PyUnicode_STATE(unicode).ascii = 0;
1141 _PyUnicode_DATA_ANY(unicode) = NULL;
1142 _PyUnicode_LENGTH(unicode) = 0;
1143 _PyUnicode_UTF8(unicode) = NULL;
1144 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1147 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001148 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001150 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152
Jeremy Hyltond8082792003-09-16 19:41:39 +00001153 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001154 * the caller fails before initializing str -- unicode_resize()
1155 * reads str[0], and the Keep-Alive optimization can keep memory
1156 * allocated for str alive across a call to unicode_dealloc(unicode).
1157 * We don't want unicode_resize to read uninitialized memory in
1158 * that case.
1159 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 _PyUnicode_WSTR(unicode)[0] = 0;
1161 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001162
Victor Stinner7931d9a2011-11-04 00:22:48 +01001163 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 return unicode;
1165}
1166
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167static const char*
1168unicode_kind_name(PyObject *unicode)
1169{
Victor Stinner42dfd712011-10-03 14:41:45 +02001170 /* don't check consistency: unicode_kind_name() is called from
1171 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 if (!PyUnicode_IS_COMPACT(unicode))
1173 {
1174 if (!PyUnicode_IS_READY(unicode))
1175 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001176 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001177 {
1178 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001179 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001180 return "legacy ascii";
1181 else
1182 return "legacy latin1";
1183 case PyUnicode_2BYTE_KIND:
1184 return "legacy UCS2";
1185 case PyUnicode_4BYTE_KIND:
1186 return "legacy UCS4";
1187 default:
1188 return "<legacy invalid kind>";
1189 }
1190 }
1191 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001192 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001193 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001194 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001195 return "ascii";
1196 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001197 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001199 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001200 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001202 default:
1203 return "<invalid compact kind>";
1204 }
1205}
1206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1212
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1216void *_PyUnicode_data(void *unicode){
1217 printf("obj %p\n", unicode);
1218 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223 return PyUnicode_DATA(unicode);
1224}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001225
1226void
1227_PyUnicode_Dump(PyObject *op)
1228{
1229 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001230 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001233
Victor Stinnera849a4b2011-10-03 12:12:11 +02001234 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001235 {
1236 if (ascii->state.ascii)
1237 data = (ascii + 1);
1238 else
1239 data = (compact + 1);
1240 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001241 else
1242 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001243 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001245
Victor Stinnera849a4b2011-10-03 12:12:11 +02001246 if (ascii->wstr == data)
1247 printf("shared ");
1248 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001249
Victor Stinnera3b334d2011-10-03 13:53:37 +02001250 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001251 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001252 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001254 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001256 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001257 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001258}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259#endif
1260
1261PyObject *
1262PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263{
1264 PyObject *obj;
1265 PyCompactUnicodeObject *unicode;
1266 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001267 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001268 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 Py_ssize_t char_size;
1270 Py_ssize_t struct_size;
1271
1272 /* Optimization for empty strings */
1273 if (size == 0 && unicode_empty != NULL) {
1274 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001275 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276 }
1277
Victor Stinner9e9d6892011-10-04 01:02:02 +02001278 is_ascii = 0;
1279 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280 struct_size = sizeof(PyCompactUnicodeObject);
1281 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001282 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 char_size = 1;
1284 is_ascii = 1;
1285 struct_size = sizeof(PyASCIIObject);
1286 }
1287 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001288 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 char_size = 1;
1290 }
1291 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001292 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 char_size = 2;
1294 if (sizeof(wchar_t) == 2)
1295 is_sharing = 1;
1296 }
1297 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001298 if (maxchar > MAX_UNICODE) {
1299 PyErr_SetString(PyExc_SystemError,
1300 "invalid maximum character passed to PyUnicode_New");
1301 return NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 char_size = 4;
1305 if (sizeof(wchar_t) == 4)
1306 is_sharing = 1;
1307 }
1308
1309 /* Ensure we won't overflow the size. */
1310 if (size < 0) {
1311 PyErr_SetString(PyExc_SystemError,
1312 "Negative size passed to PyUnicode_New");
1313 return NULL;
1314 }
1315 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316 return PyErr_NoMemory();
1317
1318 /* Duplicated allocation code from _PyObject_New() instead of a call to
1319 * PyObject_New() so we are able to allocate space for the object and
1320 * it's data buffer.
1321 */
1322 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323 if (obj == NULL)
1324 return PyErr_NoMemory();
1325 obj = PyObject_INIT(obj, &PyUnicode_Type);
1326 if (obj == NULL)
1327 return NULL;
1328
1329 unicode = (PyCompactUnicodeObject *)obj;
1330 if (is_ascii)
1331 data = ((PyASCIIObject*)obj) + 1;
1332 else
1333 data = unicode + 1;
1334 _PyUnicode_LENGTH(unicode) = size;
1335 _PyUnicode_HASH(unicode) = -1;
1336 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001337 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 _PyUnicode_STATE(unicode).compact = 1;
1339 _PyUnicode_STATE(unicode).ready = 1;
1340 _PyUnicode_STATE(unicode).ascii = is_ascii;
1341 if (is_ascii) {
1342 ((char*)data)[size] = 0;
1343 _PyUnicode_WSTR(unicode) = NULL;
1344 }
Victor Stinner8f825062012-04-27 13:55:39 +02001345 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 ((char*)data)[size] = 0;
1347 _PyUnicode_WSTR(unicode) = NULL;
1348 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001350 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352 else {
1353 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001354 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001355 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001357 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 ((Py_UCS4*)data)[size] = 0;
1359 if (is_sharing) {
1360 _PyUnicode_WSTR_LENGTH(unicode) = size;
1361 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362 }
1363 else {
1364 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365 _PyUnicode_WSTR(unicode) = NULL;
1366 }
1367 }
Victor Stinner8f825062012-04-27 13:55:39 +02001368#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001369 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001370#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001371 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 return obj;
1373}
1374
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   merged into a single code point; every other unit (lone surrogates
   included) is copied through unchanged.  The other wchar_t conversions are
   implemented as macros for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number of
   decoded code points, and it is assumed to have room for one more code
   point to hold a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src + 1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Not the start of a complete pair: copy the unit verbatim. */
            *dst++ = *src;
            src += 1;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1414
Victor Stinnercd9950f2011-10-02 00:34:53 +02001415static int
Victor Stinner488fa492011-12-12 00:01:39 +01001416unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001417{
Victor Stinner488fa492011-12-12 00:01:39 +01001418 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001419 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001420 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001421 return -1;
1422 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001423 return 0;
1424}
1425
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001426static int
1427_copy_characters(PyObject *to, Py_ssize_t to_start,
1428 PyObject *from, Py_ssize_t from_start,
1429 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001431 unsigned int from_kind, to_kind;
1432 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433
Victor Stinneree4544c2012-05-09 22:24:08 +02001434 assert(0 <= how_many);
1435 assert(0 <= from_start);
1436 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001437 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001438 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001439 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440
Victor Stinnerd3f08822012-05-29 12:57:52 +02001441 assert(PyUnicode_Check(to));
1442 assert(PyUnicode_IS_READY(to));
1443 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 if (how_many == 0)
1446 return 0;
1447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001449 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001451 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
Victor Stinnerf1852262012-06-16 16:38:26 +02001453#ifdef Py_DEBUG
1454 if (!check_maxchar
1455 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456 {
1457 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458 Py_UCS4 ch;
1459 Py_ssize_t i;
1460 for (i=0; i < how_many; i++) {
1461 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462 assert(ch <= to_maxchar);
1463 }
1464 }
1465#endif
1466
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001467 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001468 if (check_maxchar
1469 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001471 /* Writing Latin-1 characters into an ASCII string requires to
1472 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001473 Py_UCS4 max_char;
1474 max_char = ucs1lib_find_max_char(from_data,
1475 (Py_UCS1*)from_data + how_many);
1476 if (max_char >= 128)
1477 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001478 }
Christian Heimesf051e432016-09-13 20:22:02 +02001479 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001480 (char*)from_data + from_kind * from_start,
1481 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001483 else if (from_kind == PyUnicode_1BYTE_KIND
1484 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001485 {
1486 _PyUnicode_CONVERT_BYTES(
1487 Py_UCS1, Py_UCS2,
1488 PyUnicode_1BYTE_DATA(from) + from_start,
1489 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490 PyUnicode_2BYTE_DATA(to) + to_start
1491 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001492 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001493 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001494 && to_kind == PyUnicode_4BYTE_KIND)
1495 {
1496 _PyUnicode_CONVERT_BYTES(
1497 Py_UCS1, Py_UCS4,
1498 PyUnicode_1BYTE_DATA(from) + from_start,
1499 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500 PyUnicode_4BYTE_DATA(to) + to_start
1501 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001502 }
1503 else if (from_kind == PyUnicode_2BYTE_KIND
1504 && to_kind == PyUnicode_4BYTE_KIND)
1505 {
1506 _PyUnicode_CONVERT_BYTES(
1507 Py_UCS2, Py_UCS4,
1508 PyUnicode_2BYTE_DATA(from) + from_start,
1509 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510 PyUnicode_4BYTE_DATA(to) + to_start
1511 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001512 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001514 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001516 if (!check_maxchar) {
1517 if (from_kind == PyUnicode_2BYTE_KIND
1518 && to_kind == PyUnicode_1BYTE_KIND)
1519 {
1520 _PyUnicode_CONVERT_BYTES(
1521 Py_UCS2, Py_UCS1,
1522 PyUnicode_2BYTE_DATA(from) + from_start,
1523 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524 PyUnicode_1BYTE_DATA(to) + to_start
1525 );
1526 }
1527 else if (from_kind == PyUnicode_4BYTE_KIND
1528 && to_kind == PyUnicode_1BYTE_KIND)
1529 {
1530 _PyUnicode_CONVERT_BYTES(
1531 Py_UCS4, Py_UCS1,
1532 PyUnicode_4BYTE_DATA(from) + from_start,
1533 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534 PyUnicode_1BYTE_DATA(to) + to_start
1535 );
1536 }
1537 else if (from_kind == PyUnicode_4BYTE_KIND
1538 && to_kind == PyUnicode_2BYTE_KIND)
1539 {
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS4, Py_UCS2,
1542 PyUnicode_4BYTE_DATA(from) + from_start,
1543 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544 PyUnicode_2BYTE_DATA(to) + to_start
1545 );
1546 }
1547 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001548 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001549 }
1550 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001551 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001554 Py_ssize_t i;
1555
Victor Stinnera0702ab2011-09-29 14:14:38 +02001556 for (i=0; i < how_many; i++) {
1557 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001558 if (ch > to_maxchar)
1559 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001560 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001562 }
1563 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return 0;
1565}
1566
Victor Stinnerd3f08822012-05-29 12:57:52 +02001567void
1568_PyUnicode_FastCopyCharacters(
1569 PyObject *to, Py_ssize_t to_start,
1570 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571{
1572 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573}
1574
1575Py_ssize_t
1576PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577 PyObject *from, Py_ssize_t from_start,
1578 Py_ssize_t how_many)
1579{
1580 int err;
1581
1582 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586
Benjamin Petersonbac79492012-01-14 13:34:47 -05001587 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001588 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001589 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 return -1;
1591
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001592 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001593 PyErr_SetString(PyExc_IndexError, "string index out of range");
1594 return -1;
1595 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001596 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001597 PyErr_SetString(PyExc_IndexError, "string index out of range");
1598 return -1;
1599 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001600 if (how_many < 0) {
1601 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602 return -1;
1603 }
1604 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001605 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001607 "Cannot write %zi characters at %zi "
1608 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001609 how_many, to_start, PyUnicode_GET_LENGTH(to));
1610 return -1;
1611 }
1612
1613 if (how_many == 0)
1614 return 0;
1615
Victor Stinner488fa492011-12-12 00:01:39 +01001616 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001617 return -1;
1618
1619 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620 if (err) {
1621 PyErr_Format(PyExc_SystemError,
1622 "Cannot copy %s characters "
1623 "into a string of %s characters",
1624 unicode_kind_name(from),
1625 unicode_kind_name(to));
1626 return -1;
1627 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001628 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629}
1630
Victor Stinner17222162011-09-28 22:15:37 +02001631/* Find the maximum code point and count the number of surrogate pairs so a
1632 correct string length can be computed before converting a string to UCS4.
1633 This function counts single surrogates as a character and not as a pair.
1634
1635 Return 0 on success, or -1 on error. */
1636static int
1637find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639{
1640 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001641 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642
Victor Stinnerc53be962011-10-02 21:33:54 +02001643 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 *num_surrogates = 0;
1645 *maxchar = 0;
1646
1647 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001649 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650 && (iter+1) < end
1651 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652 {
1653 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654 ++(*num_surrogates);
1655 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 }
1657 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001659 {
1660 ch = *iter;
1661 iter++;
1662 }
1663 if (ch > *maxchar) {
1664 *maxchar = ch;
1665 if (*maxchar > MAX_UNICODE) {
1666 PyErr_Format(PyExc_ValueError,
1667 "character U+%x is not in range [U+0000; U+10ffff]",
1668 ch);
1669 return -1;
1670 }
1671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 }
1673 return 0;
1674}
1675
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001676int
1677_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678{
1679 wchar_t *end;
1680 Py_UCS4 maxchar = 0;
1681 Py_ssize_t num_surrogates;
1682#if SIZEOF_WCHAR_T == 2
1683 Py_ssize_t length_wo_surrogates;
1684#endif
1685
Georg Brandl7597add2011-10-05 16:36:47 +02001686 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001687 strings were created using _PyObject_New() and where no canonical
1688 representation (the str field) has been set yet aka strings
1689 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001690 assert(_PyUnicode_CHECK(unicode));
1691 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001693 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001694 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001695 /* Actually, it should neither be interned nor be anything else: */
1696 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001699 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001700 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702
1703 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1705 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 PyErr_NoMemory();
1707 return -1;
1708 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001709 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 _PyUnicode_WSTR(unicode), end,
1711 PyUnicode_1BYTE_DATA(unicode));
1712 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1715 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001716 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001717 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001718 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 }
1720 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001721 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001722 _PyUnicode_UTF8(unicode) = NULL;
1723 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 }
1725 PyObject_FREE(_PyUnicode_WSTR(unicode));
1726 _PyUnicode_WSTR(unicode) = NULL;
1727 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1728 }
1729 /* In this case we might have to convert down from 4-byte native
1730 wchar_t to 2-byte unicode. */
1731 else if (maxchar < 65536) {
1732 assert(num_surrogates == 0 &&
1733 "FindMaxCharAndNumSurrogatePairs() messed up");
1734
Victor Stinner506f5922011-09-28 22:34:18 +02001735#if SIZEOF_WCHAR_T == 2
1736 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001737 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001738 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1739 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1740 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001741 _PyUnicode_UTF8(unicode) = NULL;
1742 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001743#else
1744 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001745 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001746 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001748 PyErr_NoMemory();
1749 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 }
Victor Stinner506f5922011-09-28 22:34:18 +02001751 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1752 _PyUnicode_WSTR(unicode), end,
1753 PyUnicode_2BYTE_DATA(unicode));
1754 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1755 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1756 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8(unicode) = NULL;
1758 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001759 PyObject_FREE(_PyUnicode_WSTR(unicode));
1760 _PyUnicode_WSTR(unicode) = NULL;
1761 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1762#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
1764 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1765 else {
1766#if SIZEOF_WCHAR_T == 2
1767 /* in case the native representation is 2-bytes, we need to allocate a
1768 new normalized 4-byte version. */
1769 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001770 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1771 PyErr_NoMemory();
1772 return -1;
1773 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1775 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 PyErr_NoMemory();
1777 return -1;
1778 }
1779 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1780 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001781 _PyUnicode_UTF8(unicode) = NULL;
1782 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001783 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1784 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001785 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 PyObject_FREE(_PyUnicode_WSTR(unicode));
1787 _PyUnicode_WSTR(unicode) = NULL;
1788 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1789#else
1790 assert(num_surrogates == 0);
1791
Victor Stinnerc3c74152011-10-02 20:39:55 +02001792 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1797#endif
1798 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1799 }
1800 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001801 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 return 0;
1803}
1804
Alexander Belopolsky40018472011-02-26 01:02:56 +00001805static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001806unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807{
Walter Dörwald16807132007-05-25 13:52:07 +00001808 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001809 case SSTATE_NOT_INTERNED:
1810 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001811
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 case SSTATE_INTERNED_MORTAL:
1813 /* revive dead object temporarily for DelItem */
1814 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001815 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001816 Py_FatalError(
1817 "deletion of interned string failed");
1818 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001819
Benjamin Peterson29060642009-01-31 22:14:21 +00001820 case SSTATE_INTERNED_IMMORTAL:
1821 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001822 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001823
Benjamin Peterson29060642009-01-31 22:14:21 +00001824 default:
1825 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001826 }
1827
Victor Stinner03490912011-10-03 23:45:12 +02001828 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001830 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001832 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001835 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836}
1837
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string, or a one-character string stored in the unicode_latin1
   table), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be cached. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1854
Alexander Belopolsky40018472011-02-26 01:02:56 +00001855static int
Victor Stinner488fa492011-12-12 00:01:39 +01001856unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001857{
Victor Stinner488fa492011-12-12 00:01:39 +01001858 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001859 if (Py_REFCNT(unicode) != 1)
1860 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001861 if (_PyUnicode_HASH(unicode) != -1)
1862 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001863 if (PyUnicode_CHECK_INTERNED(unicode))
1864 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001865 if (!PyUnicode_CheckExact(unicode))
1866 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001867#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001868 /* singleton refcount is greater than 1 */
1869 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001870#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001871 return 1;
1872}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001873
Victor Stinnerfe226c02011-10-03 03:52:20 +02001874static int
1875unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876{
1877 PyObject *unicode;
1878 Py_ssize_t old_length;
1879
1880 assert(p_unicode != NULL);
1881 unicode = *p_unicode;
1882
1883 assert(unicode != NULL);
1884 assert(PyUnicode_Check(unicode));
1885 assert(0 <= length);
1886
Victor Stinner910337b2011-10-03 03:20:16 +02001887 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001888 old_length = PyUnicode_WSTR_LENGTH(unicode);
1889 else
1890 old_length = PyUnicode_GET_LENGTH(unicode);
1891 if (old_length == length)
1892 return 0;
1893
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001894 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001895 _Py_INCREF_UNICODE_EMPTY();
1896 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001898 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001899 return 0;
1900 }
1901
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001903 PyObject *copy = resize_copy(unicode, length);
1904 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001906 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001908 }
1909
Victor Stinnerfe226c02011-10-03 03:52:20 +02001910 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001911 PyObject *new_unicode = resize_compact(unicode, length);
1912 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001913 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001914 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001915 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001916 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001917 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001918}
1919
Alexander Belopolsky40018472011-02-26 01:02:56 +00001920int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001921PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001922{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001923 PyObject *unicode;
1924 if (p_unicode == NULL) {
1925 PyErr_BadInternalCall();
1926 return -1;
1927 }
1928 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001929 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001930 {
1931 PyErr_BadInternalCall();
1932 return -1;
1933 }
1934 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001935}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001936
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001937/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001938
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001939 WARNING: The function doesn't copy the terminating null character and
1940 doesn't check the maximum character (may write a latin1 character in an
1941 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001942static void
1943unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001945{
1946 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001948 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001949
1950 switch (kind) {
1951 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001952 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001953#ifdef Py_DEBUG
1954 if (PyUnicode_IS_ASCII(unicode)) {
1955 Py_UCS4 maxchar = ucs1lib_find_max_char(
1956 (const Py_UCS1*)str,
1957 (const Py_UCS1*)str + len);
1958 assert(maxchar < 128);
1959 }
1960#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001961 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001962 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001963 }
1964 case PyUnicode_2BYTE_KIND: {
1965 Py_UCS2 *start = (Py_UCS2 *)data + index;
1966 Py_UCS2 *ucs2 = start;
1967 assert(index <= PyUnicode_GET_LENGTH(unicode));
1968
Victor Stinner184252a2012-06-16 02:57:41 +02001969 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001970 *ucs2 = (Py_UCS2)*str;
1971
1972 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001973 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001974 }
1975 default: {
1976 Py_UCS4 *start = (Py_UCS4 *)data + index;
1977 Py_UCS4 *ucs4 = start;
1978 assert(kind == PyUnicode_4BYTE_KIND);
1979 assert(index <= PyUnicode_GET_LENGTH(unicode));
1980
Victor Stinner184252a2012-06-16 02:57:41 +02001981 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982 *ucs4 = (Py_UCS4)*str;
1983
1984 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001985 }
1986 }
1987}
1988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989static PyObject*
1990get_latin1_char(unsigned char ch)
1991{
Victor Stinnera464fc12011-10-02 20:39:30 +02001992 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001994 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 if (!unicode)
1996 return NULL;
1997 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001998 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 unicode_latin1[ch] = unicode;
2000 }
2001 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002002 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003}
2004
Victor Stinner985a82a2014-01-03 12:53:47 +01002005static PyObject*
2006unicode_char(Py_UCS4 ch)
2007{
2008 PyObject *unicode;
2009
2010 assert(ch <= MAX_UNICODE);
2011
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002012 if (ch < 256)
2013 return get_latin1_char(ch);
2014
Victor Stinner985a82a2014-01-03 12:53:47 +01002015 unicode = PyUnicode_New(1, ch);
2016 if (unicode == NULL)
2017 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002018
2019 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002022 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025 }
2026 assert(_PyUnicode_CheckConsistency(unicode, 1));
2027 return unicode;
2028}
2029
Alexander Belopolsky40018472011-02-26 01:02:56 +00002030PyObject *
2031PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002033 if (u == NULL)
2034 return (PyObject*)_PyUnicode_New(size);
2035
2036 if (size < 0) {
2037 PyErr_BadInternalCall();
2038 return NULL;
2039 }
2040
2041 return PyUnicode_FromWideChar(u, size);
2042}
2043
2044PyObject *
2045PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002047 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 Py_UCS4 maxchar = 0;
2049 Py_ssize_t num_surrogates;
2050
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002051 if (u == NULL && size != 0) {
2052 PyErr_BadInternalCall();
2053 return NULL;
2054 }
2055
2056 if (size == -1) {
2057 size = wcslen(u);
2058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002060 /* If the Unicode data is known at construction time, we can apply
2061 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002064 if (size == 0)
2065 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 /* Single character Unicode objects in the Latin-1 range are
2068 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002069 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 return get_latin1_char((unsigned char)*u);
2071
2072 /* If not empty and not single character, copy the Unicode data
2073 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002074 if (find_maxchar_surrogates(u, u + size,
2075 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 return NULL;
2077
Victor Stinner8faf8212011-12-08 22:14:11 +01002078 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 if (!unicode)
2080 return NULL;
2081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 switch (PyUnicode_KIND(unicode)) {
2083 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002084 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086 break;
2087 case PyUnicode_2BYTE_KIND:
2088#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002089 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002091 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093#endif
2094 break;
2095 case PyUnicode_4BYTE_KIND:
2096#if SIZEOF_WCHAR_T == 2
2097 /* This is the only case which has to process surrogates, thus
2098 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002099 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100#else
2101 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002102 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103#endif
2104 break;
2105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002106 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002109 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110}
2111
Alexander Belopolsky40018472011-02-26 01:02:56 +00002112PyObject *
2113PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002114{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002115 if (size < 0) {
2116 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002117 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 return NULL;
2119 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002120 if (u != NULL)
2121 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122 else
2123 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002128{
2129 size_t size = strlen(u);
2130 if (size > PY_SSIZE_T_MAX) {
2131 PyErr_SetString(PyExc_OverflowError, "input too long");
2132 return NULL;
2133 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002134 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002135}
2136
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137PyObject *
2138_PyUnicode_FromId(_Py_Identifier *id)
2139{
2140 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002141 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142 strlen(id->string),
2143 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002144 if (!id->object)
2145 return NULL;
2146 PyUnicode_InternInPlace(&id->object);
2147 assert(!id->next);
2148 id->next = static_strings;
2149 static_strings = id;
2150 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002151 return id->object;
2152}
2153
2154void
2155_PyUnicode_ClearStaticStrings()
2156{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002157 _Py_Identifier *tmp, *s = static_strings;
2158 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002159 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002160 tmp = s->next;
2161 s->next = NULL;
2162 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002163 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002164 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002165}
2166
Benjamin Peterson0df54292012-03-26 14:50:32 -04002167/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002168
Victor Stinnerd3f08822012-05-29 12:57:52 +02002169PyObject*
2170_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002171{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002172 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002173 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002174 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002175#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002176 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002177#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002178 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002179 }
Victor Stinner785938e2011-12-11 20:09:03 +01002180 unicode = PyUnicode_New(size, 127);
2181 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002182 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002183 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184 assert(_PyUnicode_CheckConsistency(unicode, 1));
2185 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002186}
2187
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002188static Py_UCS4
2189kind_maxchar_limit(unsigned int kind)
2190{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002191 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002192 case PyUnicode_1BYTE_KIND:
2193 return 0x80;
2194 case PyUnicode_2BYTE_KIND:
2195 return 0x100;
2196 case PyUnicode_4BYTE_KIND:
2197 return 0x10000;
2198 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002199 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002200 }
2201}
2202
Victor Stinner702c7342011-10-05 13:50:52 +02002203static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002204_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002207 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002208
Serhiy Storchaka678db842013-01-26 12:16:36 +02002209 if (size == 0)
2210 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002211 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002212 if (size == 1)
2213 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002215 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002216 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 if (!res)
2218 return NULL;
2219 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002220 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002222}
2223
Victor Stinnere57b1c02011-09-28 22:20:48 +02002224static PyObject*
2225_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226{
2227 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002228 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002229
Serhiy Storchaka678db842013-01-26 12:16:36 +02002230 if (size == 0)
2231 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002232 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002233 if (size == 1)
2234 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002235
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002236 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002237 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 if (!res)
2239 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002240 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002242 else {
2243 _PyUnicode_CONVERT_BYTES(
2244 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002246 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 return res;
2248}
2249
Victor Stinnere57b1c02011-09-28 22:20:48 +02002250static PyObject*
2251_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252{
2253 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002254 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255
Serhiy Storchaka678db842013-01-26 12:16:36 +02002256 if (size == 0)
2257 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002258 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002259 if (size == 1)
2260 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002261
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002262 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002263 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 if (!res)
2265 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002266 if (max_char < 256)
2267 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268 PyUnicode_1BYTE_DATA(res));
2269 else if (max_char < 0x10000)
2270 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271 PyUnicode_2BYTE_DATA(res));
2272 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002274 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 return res;
2276}
2277
2278PyObject*
2279PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002281 if (size < 0) {
2282 PyErr_SetString(PyExc_ValueError, "size must be positive");
2283 return NULL;
2284 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002285 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002289 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002291 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002293 PyErr_SetString(PyExc_SystemError, "invalid kind");
2294 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296}
2297
Victor Stinnerece58de2012-04-23 23:36:38 +02002298Py_UCS4
2299_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300{
2301 enum PyUnicode_Kind kind;
2302 void *startptr, *endptr;
2303
2304 assert(PyUnicode_IS_READY(unicode));
2305 assert(0 <= start);
2306 assert(end <= PyUnicode_GET_LENGTH(unicode));
2307 assert(start <= end);
2308
2309 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310 return PyUnicode_MAX_CHAR_VALUE(unicode);
2311
2312 if (start == end)
2313 return 127;
2314
Victor Stinner94d558b2012-04-27 22:26:58 +02002315 if (PyUnicode_IS_ASCII(unicode))
2316 return 127;
2317
Victor Stinnerece58de2012-04-23 23:36:38 +02002318 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002319 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002320 endptr = (char *)startptr + end * kind;
2321 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002322 switch(kind) {
2323 case PyUnicode_1BYTE_KIND:
2324 return ucs1lib_find_max_char(startptr, endptr);
2325 case PyUnicode_2BYTE_KIND:
2326 return ucs2lib_find_max_char(startptr, endptr);
2327 case PyUnicode_4BYTE_KIND:
2328 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002329 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002330 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002331 }
2332}
2333
Victor Stinner25a4b292011-10-06 12:31:55 +02002334/* Ensure that a string uses the most efficient storage, if it is not the
2335 case: create a new string with of the right kind. Write NULL into *p_unicode
2336 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002337static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002338unicode_adjust_maxchar(PyObject **p_unicode)
2339{
2340 PyObject *unicode, *copy;
2341 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002342 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002343 unsigned int kind;
2344
2345 assert(p_unicode != NULL);
2346 unicode = *p_unicode;
2347 assert(PyUnicode_IS_READY(unicode));
2348 if (PyUnicode_IS_ASCII(unicode))
2349 return;
2350
2351 len = PyUnicode_GET_LENGTH(unicode);
2352 kind = PyUnicode_KIND(unicode);
2353 if (kind == PyUnicode_1BYTE_KIND) {
2354 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 max_char = ucs1lib_find_max_char(u, u + len);
2356 if (max_char >= 128)
2357 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 }
2359 else if (kind == PyUnicode_2BYTE_KIND) {
2360 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002361 max_char = ucs2lib_find_max_char(u, u + len);
2362 if (max_char >= 256)
2363 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002364 }
2365 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002366 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002367 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002368 max_char = ucs4lib_find_max_char(u, u + len);
2369 if (max_char >= 0x10000)
2370 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002371 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002372 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002373 if (copy != NULL)
2374 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002375 Py_DECREF(unicode);
2376 *p_unicode = copy;
2377}
2378
Victor Stinner034f6cf2011-09-30 02:26:44 +02002379PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002380_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002381{
Victor Stinner87af4f22011-11-21 23:03:47 +01002382 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002383 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002384
Victor Stinner034f6cf2011-09-30 02:26:44 +02002385 if (!PyUnicode_Check(unicode)) {
2386 PyErr_BadInternalCall();
2387 return NULL;
2388 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002389 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002390 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002391
Victor Stinner87af4f22011-11-21 23:03:47 +01002392 length = PyUnicode_GET_LENGTH(unicode);
2393 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002394 if (!copy)
2395 return NULL;
2396 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397
Christian Heimesf051e432016-09-13 20:22:02 +02002398 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002399 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002400 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002401 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002402}
2403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404
Victor Stinnerbc603d12011-10-02 01:00:40 +02002405/* Widen Unicode objects to larger buffers. Don't write terminating null
2406 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407
2408void*
2409_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002411 Py_ssize_t len;
2412 void *result;
2413 unsigned int skind;
2414
Benjamin Petersonbac79492012-01-14 13:34:47 -05002415 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002416 return NULL;
2417
2418 len = PyUnicode_GET_LENGTH(s);
2419 skind = PyUnicode_KIND(s);
2420 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002421 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 return NULL;
2423 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002424 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002426 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002427 if (!result)
2428 return PyErr_NoMemory();
2429 assert(skind == PyUnicode_1BYTE_KIND);
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS1, Py_UCS2,
2432 PyUnicode_1BYTE_DATA(s),
2433 PyUnicode_1BYTE_DATA(s) + len,
2434 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002436 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002437 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002438 if (!result)
2439 return PyErr_NoMemory();
2440 if (skind == PyUnicode_2BYTE_KIND) {
2441 _PyUnicode_CONVERT_BYTES(
2442 Py_UCS2, Py_UCS4,
2443 PyUnicode_2BYTE_DATA(s),
2444 PyUnicode_2BYTE_DATA(s) + len,
2445 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002447 else {
2448 assert(skind == PyUnicode_1BYTE_KIND);
2449 _PyUnicode_CONVERT_BYTES(
2450 Py_UCS1, Py_UCS4,
2451 PyUnicode_1BYTE_DATA(s),
2452 PyUnicode_1BYTE_DATA(s) + len,
2453 result);
2454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002456 default:
2457 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 }
Victor Stinner01698042011-10-04 00:04:26 +02002459 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 return NULL;
2461}
2462
2463static Py_UCS4*
2464as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
2467 int kind;
2468 void *data;
2469 Py_ssize_t len, targetlen;
2470 if (PyUnicode_READY(string) == -1)
2471 return NULL;
2472 kind = PyUnicode_KIND(string);
2473 data = PyUnicode_DATA(string);
2474 len = PyUnicode_GET_LENGTH(string);
2475 targetlen = len;
2476 if (copy_null)
2477 targetlen++;
2478 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002479 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 if (!target) {
2481 PyErr_NoMemory();
2482 return NULL;
2483 }
2484 }
2485 else {
2486 if (targetsize < targetlen) {
2487 PyErr_Format(PyExc_SystemError,
2488 "string is longer than the buffer");
2489 if (copy_null && 0 < targetsize)
2490 target[0] = 0;
2491 return NULL;
2492 }
2493 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002494 if (kind == PyUnicode_1BYTE_KIND) {
2495 Py_UCS1 *start = (Py_UCS1 *) data;
2496 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002498 else if (kind == PyUnicode_2BYTE_KIND) {
2499 Py_UCS2 *start = (Py_UCS2 *) data;
2500 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501 }
2502 else {
2503 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002504 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (copy_null)
2507 target[len] = 0;
2508 return target;
2509}
2510
2511Py_UCS4*
2512PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513 int copy_null)
2514{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002515 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 PyErr_BadInternalCall();
2517 return NULL;
2518 }
2519 return as_ucs4(string, target, targetsize, copy_null);
2520}
2521
2522Py_UCS4*
2523PyUnicode_AsUCS4Copy(PyObject *string)
2524{
2525 return as_ucs4(string, NULL, 0, 1);
2526}
2527
/* Maximum number of characters required for the output of %lld or %p;
   used to size the sprintf() buffers in unicode_fromformat_arg().
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2531#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002532
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533static int
2534unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535 Py_ssize_t width, Py_ssize_t precision)
2536{
2537 Py_ssize_t length, fill, arglen;
2538 Py_UCS4 maxchar;
2539
2540 if (PyUnicode_READY(str) == -1)
2541 return -1;
2542
2543 length = PyUnicode_GET_LENGTH(str);
2544 if ((precision == -1 || precision >= length)
2545 && width <= length)
2546 return _PyUnicodeWriter_WriteStr(writer, str);
2547
2548 if (precision != -1)
2549 length = Py_MIN(precision, length);
2550
2551 arglen = Py_MAX(length, width);
2552 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554 else
2555 maxchar = writer->maxchar;
2556
2557 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558 return -1;
2559
2560 if (width > length) {
2561 fill = width - length;
2562 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563 return -1;
2564 writer->pos += fill;
2565 }
2566
2567 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568 str, 0, length);
2569 writer->pos += length;
2570 return 0;
2571}
2572
2573static int
2574unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2575 Py_ssize_t width, Py_ssize_t precision)
2576{
2577 /* UTF-8 */
2578 Py_ssize_t length;
2579 PyObject *unicode;
2580 int res;
2581
2582 length = strlen(str);
2583 if (precision != -1)
2584 length = Py_MIN(length, precision);
2585 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2586 if (unicode == NULL)
2587 return -1;
2588
2589 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2590 Py_DECREF(unicode);
2591 return res;
2592}
2593
Victor Stinner96865452011-03-01 23:44:09 +00002594static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002595unicode_fromformat_arg(_PyUnicodeWriter *writer,
2596 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002597{
Victor Stinnere215d962012-10-06 23:03:36 +02002598 const char *p;
2599 Py_ssize_t len;
2600 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002601 Py_ssize_t width;
2602 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002603 int longflag;
2604 int longlongflag;
2605 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002607
2608 p = f;
2609 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002610 zeropad = 0;
2611 if (*f == '0') {
2612 zeropad = 1;
2613 f++;
2614 }
Victor Stinner96865452011-03-01 23:44:09 +00002615
2616 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 width = -1;
2618 if (Py_ISDIGIT((unsigned)*f)) {
2619 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002620 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002622 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002623 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002624 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002625 return NULL;
2626 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002628 f++;
2629 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 }
2631 precision = -1;
2632 if (*f == '.') {
2633 f++;
2634 if (Py_ISDIGIT((unsigned)*f)) {
2635 precision = (*f - '0');
2636 f++;
2637 while (Py_ISDIGIT((unsigned)*f)) {
2638 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2639 PyErr_SetString(PyExc_ValueError,
2640 "precision too big");
2641 return NULL;
2642 }
2643 precision = (precision * 10) + (*f - '0');
2644 f++;
2645 }
2646 }
Victor Stinner96865452011-03-01 23:44:09 +00002647 if (*f == '%') {
2648 /* "%.3%s" => f points to "3" */
2649 f--;
2650 }
2651 }
2652 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002653 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002654 f--;
2655 }
Victor Stinner96865452011-03-01 23:44:09 +00002656
2657 /* Handle %ld, %lu, %lld and %llu. */
2658 longflag = 0;
2659 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002660 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002661 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002662 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002663 longflag = 1;
2664 ++f;
2665 }
Victor Stinner96865452011-03-01 23:44:09 +00002666 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002667 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002668 longlongflag = 1;
2669 f += 2;
2670 }
Victor Stinner96865452011-03-01 23:44:09 +00002671 }
2672 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002673 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002674 size_tflag = 1;
2675 ++f;
2676 }
Victor Stinnere215d962012-10-06 23:03:36 +02002677
2678 if (f[1] == '\0')
2679 writer->overallocate = 0;
2680
2681 switch (*f) {
2682 case 'c':
2683 {
2684 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002685 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002686 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002687 "character argument not in range(0x110000)");
2688 return NULL;
2689 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002690 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002691 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002692 break;
2693 }
2694
2695 case 'i':
2696 case 'd':
2697 case 'u':
2698 case 'x':
2699 {
2700 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002701 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002702 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002703
2704 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002705 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002707 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002708 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002709 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002710 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002711 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002712 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002713 va_arg(*vargs, size_t));
2714 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002715 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002716 va_arg(*vargs, unsigned int));
2717 }
2718 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002719 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002720 }
2721 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002722 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002723 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002725 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002726 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002727 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002728 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002729 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002730 va_arg(*vargs, Py_ssize_t));
2731 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002732 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002733 va_arg(*vargs, int));
2734 }
2735 assert(len >= 0);
2736
Victor Stinnere215d962012-10-06 23:03:36 +02002737 if (precision < len)
2738 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002739
2740 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002741 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2742 return NULL;
2743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 if (width > precision) {
2745 Py_UCS4 fillchar;
2746 fill = width - precision;
2747 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002748 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2749 return NULL;
2750 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002751 }
Victor Stinner15a11362012-10-06 23:48:20 +02002752 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002753 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002754 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2755 return NULL;
2756 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002757 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002758
Victor Stinner4a587072013-11-19 12:54:53 +01002759 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2760 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002761 break;
2762 }
2763
2764 case 'p':
2765 {
2766 char number[MAX_LONG_LONG_CHARS];
2767
2768 len = sprintf(number, "%p", va_arg(*vargs, void*));
2769 assert(len >= 0);
2770
2771 /* %p is ill-defined: ensure leading 0x. */
2772 if (number[1] == 'X')
2773 number[1] = 'x';
2774 else if (number[1] != 'x') {
2775 memmove(number + 2, number,
2776 strlen(number) + 1);
2777 number[0] = '0';
2778 number[1] = 'x';
2779 len += 2;
2780 }
2781
Victor Stinner4a587072013-11-19 12:54:53 +01002782 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002783 return NULL;
2784 break;
2785 }
2786
2787 case 's':
2788 {
2789 /* UTF-8 */
2790 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002793 break;
2794 }
2795
2796 case 'U':
2797 {
2798 PyObject *obj = va_arg(*vargs, PyObject *);
2799 assert(obj && _PyUnicode_CHECK(obj));
2800
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002802 return NULL;
2803 break;
2804 }
2805
2806 case 'V':
2807 {
2808 PyObject *obj = va_arg(*vargs, PyObject *);
2809 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002810 if (obj) {
2811 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002812 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002813 return NULL;
2814 }
2815 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002816 assert(str != NULL);
2817 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002818 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002819 }
2820 break;
2821 }
2822
2823 case 'S':
2824 {
2825 PyObject *obj = va_arg(*vargs, PyObject *);
2826 PyObject *str;
2827 assert(obj);
2828 str = PyObject_Str(obj);
2829 if (!str)
2830 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002831 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002832 Py_DECREF(str);
2833 return NULL;
2834 }
2835 Py_DECREF(str);
2836 break;
2837 }
2838
2839 case 'R':
2840 {
2841 PyObject *obj = va_arg(*vargs, PyObject *);
2842 PyObject *repr;
2843 assert(obj);
2844 repr = PyObject_Repr(obj);
2845 if (!repr)
2846 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002847 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002848 Py_DECREF(repr);
2849 return NULL;
2850 }
2851 Py_DECREF(repr);
2852 break;
2853 }
2854
2855 case 'A':
2856 {
2857 PyObject *obj = va_arg(*vargs, PyObject *);
2858 PyObject *ascii;
2859 assert(obj);
2860 ascii = PyObject_ASCII(obj);
2861 if (!ascii)
2862 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002863 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002864 Py_DECREF(ascii);
2865 return NULL;
2866 }
2867 Py_DECREF(ascii);
2868 break;
2869 }
2870
2871 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002872 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002873 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002874 break;
2875
2876 default:
2877 /* if we stumble upon an unknown formatting code, copy the rest
2878 of the format string to the output string. (we cannot just
2879 skip the code, since there's no way to know what's in the
2880 argument list) */
2881 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002882 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002883 return NULL;
2884 f = p+len;
2885 return f;
2886 }
2887
2888 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002889 return f;
2890}
2891
Walter Dörwaldd2034312007-05-18 16:29:38 +00002892PyObject *
2893PyUnicode_FromFormatV(const char *format, va_list vargs)
2894{
Victor Stinnere215d962012-10-06 23:03:36 +02002895 va_list vargs2;
2896 const char *f;
2897 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002898
Victor Stinner8f674cc2013-04-17 23:02:17 +02002899 _PyUnicodeWriter_Init(&writer);
2900 writer.min_length = strlen(format) + 100;
2901 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002902
Benjamin Peterson0c212142016-09-20 20:39:33 -07002903 // Copy varags to be able to pass a reference to a subfunction.
2904 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002905
2906 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002908 f = unicode_fromformat_arg(&writer, f, &vargs2);
2909 if (f == NULL)
2910 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 const char *p;
2914 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002915
Victor Stinnere215d962012-10-06 23:03:36 +02002916 p = f;
2917 do
2918 {
2919 if ((unsigned char)*p > 127) {
2920 PyErr_Format(PyExc_ValueError,
2921 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2922 "string, got a non-ASCII byte: 0x%02x",
2923 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002924 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002925 }
2926 p++;
2927 }
2928 while (*p != '\0' && *p != '%');
2929 len = p - f;
2930
2931 if (*p == '\0')
2932 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002933
2934 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002935 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002936
2937 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002940 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002941 return _PyUnicodeWriter_Finish(&writer);
2942
2943 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002944 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002945 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002946 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002947}
2948
Walter Dörwaldd2034312007-05-18 16:29:38 +00002949PyObject *
2950PyUnicode_FromFormat(const char *format, ...)
2951{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 PyObject* ret;
2953 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002954
2955#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002956 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002957#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002958 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002959#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002960 ret = PyUnicode_FromFormatV(format, vargs);
2961 va_end(vargs);
2962 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002963}
2964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965#ifdef HAVE_WCHAR_H
2966
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002967/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002968
Victor Stinnerd88d9832011-09-06 02:00:05 +02002969 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002970 character) required to convert the unicode object. Ignore size argument.
2971
Victor Stinnerd88d9832011-09-06 02:00:05 +02002972 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002973 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002974 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002975Py_ssize_t
2976PyUnicode_AsWideChar(PyObject *unicode,
2977 wchar_t *w,
2978 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002979{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002980 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002981 const wchar_t *wstr;
2982
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002983 if (unicode == NULL) {
2984 PyErr_BadInternalCall();
2985 return -1;
2986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002988 if (wstr == NULL)
2989 return -1;
2990
Victor Stinner5593d8a2010-10-02 11:11:27 +00002991 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002992 if (size > res)
2993 size = res + 1;
2994 else
2995 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002996 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002997 return res;
2998 }
2999 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003000 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00003001}
3002
Victor Stinner137c34c2010-09-29 10:25:54 +00003003wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003004PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003005 Py_ssize_t *size)
3006{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003007 const wchar_t *wstr;
3008 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003009 Py_ssize_t buflen;
3010
3011 if (unicode == NULL) {
3012 PyErr_BadInternalCall();
3013 return NULL;
3014 }
3015
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003016 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3017 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003018 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003019 }
3020 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3021 PyErr_SetString(PyExc_ValueError,
3022 "embedded null character");
3023 return NULL;
3024 }
3025
3026 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003027 if (buffer == NULL) {
3028 PyErr_NoMemory();
3029 return NULL;
3030 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003031 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003032 if (size != NULL)
3033 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003034 return buffer;
3035}
3036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003037#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038
Alexander Belopolsky40018472011-02-26 01:02:56 +00003039PyObject *
3040PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003041{
Victor Stinner8faf8212011-12-08 22:14:11 +01003042 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 PyErr_SetString(PyExc_ValueError,
3044 "chr() arg not in range(0x110000)");
3045 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003046 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003047
Victor Stinner985a82a2014-01-03 12:53:47 +01003048 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003052PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003054 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003056 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003057 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003058 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 Py_INCREF(obj);
3060 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003061 }
3062 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003063 /* For a Unicode subtype that's not a Unicode object,
3064 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003065 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003066 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003067 PyErr_Format(PyExc_TypeError,
3068 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003069 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003070 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003071}
3072
Alexander Belopolsky40018472011-02-26 01:02:56 +00003073PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003074PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003075 const char *encoding,
3076 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003077{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003078 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003079 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003080
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 PyErr_BadInternalCall();
3083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003086 /* Decoding bytes objects is the most common case and should be fast */
3087 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003088 if (PyBytes_GET_SIZE(obj) == 0)
3089 _Py_RETURN_UNICODE_EMPTY();
3090 v = PyUnicode_Decode(
3091 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3092 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 return v;
3094 }
3095
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003096 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 PyErr_SetString(PyExc_TypeError,
3098 "decoding str is not supported");
3099 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003100 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003101
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003102 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3103 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3104 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003105 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003106 Py_TYPE(obj)->tp_name);
3107 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003108 }
Tim Petersced69f82003-09-16 20:30:58 +00003109
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003110 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003111 PyBuffer_Release(&buffer);
3112 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003114
Serhiy Storchaka05997252013-01-26 12:14:02 +02003115 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003116 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003117 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118}
3119
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Any
   run of other characters between two copied characters is collapsed into
   a single '_'; such runs at the start or at the end of the name are
   dropped.  The result is null-terminated.

   `lower_len` is the size of the `lower` buffer in bytes, including room
   for the terminating null character.

   Return 1 on success, or 0 on error (lower_len is 0, or the normalized
   name is longer than lower_len-1).  On error the content of `lower` is
   unspecified and may not be null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       computing &lower[lower_len - 1], which would wrap around. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    /* Last usable slot: reserved for the terminating null character. */
    l_end = &lower[lower_len - 1];
    /* Set when separator characters were seen since the last copied one. */
    punct = 0;

    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit one '_' for the pending separators, except at the start. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3168
Alexander Belopolsky40018472011-02-26 01:02:56 +00003169PyObject *
3170PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003171 Py_ssize_t size,
3172 const char *encoding,
3173 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003174{
3175 PyObject *buffer = NULL, *unicode;
3176 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003177 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3178
3179 if (encoding == NULL) {
3180 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3181 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003182
Fred Drakee4315f52000-05-09 19:53:39 +00003183 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003184 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3185 char *lower = buflower;
3186
3187 /* Fast paths */
3188 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3189 lower += 3;
3190 if (*lower == '_') {
3191 /* Match "utf8" and "utf_8" */
3192 lower++;
3193 }
3194
3195 if (lower[0] == '8' && lower[1] == 0) {
3196 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3197 }
3198 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3199 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3200 }
3201 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3203 }
3204 }
3205 else {
3206 if (strcmp(lower, "ascii") == 0
3207 || strcmp(lower, "us_ascii") == 0) {
3208 return PyUnicode_DecodeASCII(s, size, errors);
3209 }
Steve Dowercc16be82016-09-08 10:35:16 -07003210 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003211 else if (strcmp(lower, "mbcs") == 0) {
3212 return PyUnicode_DecodeMBCS(s, size, errors);
3213 }
3214 #endif
3215 else if (strcmp(lower, "latin1") == 0
3216 || strcmp(lower, "latin_1") == 0
3217 || strcmp(lower, "iso_8859_1") == 0
3218 || strcmp(lower, "iso8859_1") == 0) {
3219 return PyUnicode_DecodeLatin1(s, size, errors);
3220 }
3221 }
Victor Stinner37296e82010-06-10 13:36:23 +00003222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223
3224 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003225 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003226 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003227 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003228 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 if (buffer == NULL)
3230 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003231 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 if (unicode == NULL)
3233 goto onError;
3234 if (!PyUnicode_Check(unicode)) {
3235 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003236 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3237 "use codecs.decode() to decode to arbitrary types",
3238 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003239 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 Py_DECREF(unicode);
3241 goto onError;
3242 }
3243 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003244 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003245
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 Py_XDECREF(buffer);
3248 return NULL;
3249}
3250
Alexander Belopolsky40018472011-02-26 01:02:56 +00003251PyObject *
3252PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003253 const char *encoding,
3254 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003256 if (!PyUnicode_Check(unicode)) {
3257 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003258 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003259 }
3260
Serhiy Storchaka00939072016-10-27 21:05:49 +03003261 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3262 "PyUnicode_AsDecodedObject() is deprecated; "
3263 "use PyCodec_Decode() to decode from str", 1) < 0)
3264 return NULL;
3265
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003268
3269 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003270 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003271}
3272
Alexander Belopolsky40018472011-02-26 01:02:56 +00003273PyObject *
3274PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003275 const char *encoding,
3276 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003277{
3278 PyObject *v;
3279
3280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_BadArgument();
3282 goto onError;
3283 }
3284
Serhiy Storchaka00939072016-10-27 21:05:49 +03003285 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3286 "PyUnicode_AsDecodedUnicode() is deprecated; "
3287 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3288 return NULL;
3289
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003292
3293 /* Decode via the codec registry */
3294 v = PyCodec_Decode(unicode, encoding, errors);
3295 if (v == NULL)
3296 goto onError;
3297 if (!PyUnicode_Check(v)) {
3298 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003299 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3300 "use codecs.decode() to decode to arbitrary types",
3301 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003302 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003303 Py_DECREF(v);
3304 goto onError;
3305 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003306 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003307
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003309 return NULL;
3310}
3311
Alexander Belopolsky40018472011-02-26 01:02:56 +00003312PyObject *
3313PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003314 Py_ssize_t size,
3315 const char *encoding,
3316 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317{
3318 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003319
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003320 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3324 Py_DECREF(unicode);
3325 return v;
3326}
3327
Alexander Belopolsky40018472011-02-26 01:02:56 +00003328PyObject *
3329PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003330 const char *encoding,
3331 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003332{
3333 PyObject *v;
3334
3335 if (!PyUnicode_Check(unicode)) {
3336 PyErr_BadArgument();
3337 goto onError;
3338 }
3339
Serhiy Storchaka00939072016-10-27 21:05:49 +03003340 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3341 "PyUnicode_AsEncodedObject() is deprecated; "
3342 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3343 "or PyCodec_Encode() for generic encoding", 1) < 0)
3344 return NULL;
3345
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003346 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003348
3349 /* Encode via the codec registry */
3350 v = PyCodec_Encode(unicode, encoding, errors);
3351 if (v == NULL)
3352 goto onError;
3353 return v;
3354
Benjamin Peterson29060642009-01-31 22:14:21 +00003355 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003356 return NULL;
3357}
3358
Victor Stinner1b579672011-12-17 05:47:23 +01003359static int
3360locale_error_handler(const char *errors, int *surrogateescape)
3361{
Victor Stinner50149202015-09-22 00:26:54 +02003362 _Py_error_handler error_handler = get_error_handler(errors);
3363 switch (error_handler)
3364 {
3365 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003366 *surrogateescape = 0;
3367 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003368 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003369 *surrogateescape = 1;
3370 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003371 default:
3372 PyErr_Format(PyExc_ValueError,
3373 "only 'strict' and 'surrogateescape' error handlers "
3374 "are supported, not '%s'",
3375 errors);
3376 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003377 }
Victor Stinner1b579672011-12-17 05:47:23 +01003378}
3379
Victor Stinner2cba6b82018-01-10 22:46:15 +01003380static PyObject *
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003381unicode_encode_locale(PyObject *unicode, const char *errors,
3382 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003383{
Victor Stinner1b579672011-12-17 05:47:23 +01003384 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003385 if (locale_error_handler(errors, &surrogateescape) < 0)
3386 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003387
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003388 Py_ssize_t wlen;
3389 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3390 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003391 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003392 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003393
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003394 Py_ssize_t wlen2 = wcslen(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003395 if (wlen2 != wlen) {
3396 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003397 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003398 return NULL;
3399 }
3400
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003401 char *str;
3402 size_t error_pos;
3403 const char *reason;
3404 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3405 current_locale, surrogateescape);
3406 if (res != 0) {
3407 if (res == -2) {
3408 PyObject *exc;
3409 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3410 "locale", unicode,
3411 (Py_ssize_t)error_pos,
3412 (Py_ssize_t)(error_pos+1),
3413 reason);
3414 if (exc != NULL) {
3415 PyCodec_StrictErrors(exc);
3416 Py_DECREF(exc);
3417 }
3418 return NULL;
Victor Stinner2cba6b82018-01-10 22:46:15 +01003419 }
3420 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003421 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003422 PyMem_Free(wstr);
3423 return NULL;
3424 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003427
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003428 PyObject *bytes = PyBytes_FromString(str);
3429 PyMem_RawFree(str);
3430 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003431}
3432
Victor Stinnerad158722010-10-27 00:25:46 +00003433PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003434PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3435{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003436 return unicode_encode_locale(unicode, errors, 1);
3437}
3438
3439PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003440PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003441{
Steve Dowercc16be82016-09-08 10:35:16 -07003442#if defined(__APPLE__)
3443 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003444#else
Victor Stinner793b5312011-04-27 00:24:21 +02003445 PyInterpreterState *interp = PyThreadState_GET()->interp;
3446 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3447 cannot use it to encode and decode filenames before it is loaded. Load
3448 the Python codec requires to encode at least its own filename. Use the C
3449 version of the locale codec until the codec registry is initialized and
3450 the Python codec is loaded.
3451
3452 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3453 cannot only rely on it: check also interp->fscodec_initialized for
3454 subinterpreters. */
3455 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003456 return PyUnicode_AsEncodedString(unicode,
3457 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003458 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003459 }
3460 else {
Victor Stinner2cba6b82018-01-10 22:46:15 +01003461 return unicode_encode_locale(unicode,
3462 Py_FileSystemDefaultEncodeErrors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003463 }
Victor Stinnerad158722010-10-27 00:25:46 +00003464#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003465}
3466
Alexander Belopolsky40018472011-02-26 01:02:56 +00003467PyObject *
3468PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003469 const char *encoding,
3470 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471{
3472 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003473 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003474
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 if (!PyUnicode_Check(unicode)) {
3476 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 }
Fred Drakee4315f52000-05-09 19:53:39 +00003479
Victor Stinner942889a2016-09-05 15:40:10 -07003480 if (encoding == NULL) {
3481 return _PyUnicode_AsUTF8String(unicode, errors);
3482 }
3483
Fred Drakee4315f52000-05-09 19:53:39 +00003484 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003485 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3486 char *lower = buflower;
3487
3488 /* Fast paths */
3489 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3490 lower += 3;
3491 if (*lower == '_') {
3492 /* Match "utf8" and "utf_8" */
3493 lower++;
3494 }
3495
3496 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003497 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003498 }
3499 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3500 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3501 }
3502 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3503 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3504 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003505 }
Victor Stinner942889a2016-09-05 15:40:10 -07003506 else {
3507 if (strcmp(lower, "ascii") == 0
3508 || strcmp(lower, "us_ascii") == 0) {
3509 return _PyUnicode_AsASCIIString(unicode, errors);
3510 }
Steve Dowercc16be82016-09-08 10:35:16 -07003511#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003512 else if (strcmp(lower, "mbcs") == 0) {
3513 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3514 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003515#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003516 else if (strcmp(lower, "latin1") == 0 ||
3517 strcmp(lower, "latin_1") == 0 ||
3518 strcmp(lower, "iso_8859_1") == 0 ||
3519 strcmp(lower, "iso8859_1") == 0) {
3520 return _PyUnicode_AsLatin1String(unicode, errors);
3521 }
3522 }
Victor Stinner37296e82010-06-10 13:36:23 +00003523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524
3525 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003526 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003528 return NULL;
3529
3530 /* The normal path */
3531 if (PyBytes_Check(v))
3532 return v;
3533
3534 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003535 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003536 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003537 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003538
3539 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003540 "encoder %s returned bytearray instead of bytes; "
3541 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003542 encoding);
3543 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003544 Py_DECREF(v);
3545 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003546 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003547
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003548 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3549 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003550 Py_DECREF(v);
3551 return b;
3552 }
3553
3554 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003555 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3556 "use codecs.encode() to encode to arbitrary types",
3557 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003558 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003559 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003560 return NULL;
3561}
3562
Alexander Belopolsky40018472011-02-26 01:02:56 +00003563PyObject *
3564PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003565 const char *encoding,
3566 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003567{
3568 PyObject *v;
3569
3570 if (!PyUnicode_Check(unicode)) {
3571 PyErr_BadArgument();
3572 goto onError;
3573 }
3574
Serhiy Storchaka00939072016-10-27 21:05:49 +03003575 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3576 "PyUnicode_AsEncodedUnicode() is deprecated; "
3577 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3578 return NULL;
3579
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003580 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003582
3583 /* Encode via the codec registry */
3584 v = PyCodec_Encode(unicode, encoding, errors);
3585 if (v == NULL)
3586 goto onError;
3587 if (!PyUnicode_Check(v)) {
3588 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003589 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3590 "use codecs.encode() to encode to arbitrary types",
3591 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003592 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003593 Py_DECREF(v);
3594 goto onError;
3595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003597
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 return NULL;
3600}
3601
Victor Stinner2cba6b82018-01-10 22:46:15 +01003602static PyObject*
3603unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3604 int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003605{
Victor Stinner1b579672011-12-17 05:47:23 +01003606 int surrogateescape;
Victor Stinner1b579672011-12-17 05:47:23 +01003607 if (locale_error_handler(errors, &surrogateescape) < 0)
3608 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003609
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003610 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3611 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003612 return NULL;
3613 }
3614
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003615 wchar_t *wstr;
3616 size_t wlen;
3617 const char *reason;
3618 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3619 current_locale, surrogateescape);
3620 if (res != 0) {
3621 if (res == -2) {
3622 PyObject *exc;
3623 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3624 "locale", str, len,
3625 (Py_ssize_t)wlen,
3626 (Py_ssize_t)(wlen + 1),
3627 reason);
3628 if (exc != NULL) {
3629 PyCodec_StrictErrors(exc);
3630 Py_DECREF(exc);
3631 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003632 }
3633 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003634 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003635 }
Victor Stinner2f197072011-12-17 07:08:30 +01003636 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003637 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003638
3639 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3640 PyMem_RawFree(wstr);
3641 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003642}
3643
3644PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003645PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3646 const char *errors)
3647{
Victor Stinner2cba6b82018-01-10 22:46:15 +01003648 return unicode_decode_locale(str, len, errors, 1);
3649}
3650
3651PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003652PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003653{
3654 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003655 return unicode_decode_locale(str, size, errors, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003656}
3657
3658
3659PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003660PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003661 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003662 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3663}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003664
Christian Heimes5894ba72007-11-04 11:43:14 +00003665PyObject*
3666PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3667{
Steve Dowercc16be82016-09-08 10:35:16 -07003668#if defined(__APPLE__)
3669 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003670#else
Victor Stinner793b5312011-04-27 00:24:21 +02003671 PyInterpreterState *interp = PyThreadState_GET()->interp;
3672 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3673 cannot use it to encode and decode filenames before it is loaded. Load
3674 the Python codec requires to encode at least its own filename. Use the C
3675 version of the locale codec until the codec registry is initialized and
3676 the Python codec is loaded.
3677
3678 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3679 cannot only rely on it: check also interp->fscodec_initialized for
3680 subinterpreters. */
3681 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003682 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003683 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003684 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003685 }
3686 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003687 return unicode_decode_locale(s, size,
3688 Py_FileSystemDefaultEncodeErrors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689 }
Victor Stinnerad158722010-10-27 00:25:46 +00003690#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003691}
3692
Martin v. Löwis011e8422009-05-05 04:43:17 +00003693
3694int
3695PyUnicode_FSConverter(PyObject* arg, void* addr)
3696{
Brett Cannonec6ce872016-09-06 15:50:29 -07003697 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003698 PyObject *output = NULL;
3699 Py_ssize_t size;
3700 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003701 if (arg == NULL) {
3702 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003703 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003704 return 1;
3705 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003706 path = PyOS_FSPath(arg);
3707 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003708 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003709 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003710 if (PyBytes_Check(path)) {
3711 output = path;
3712 }
3713 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3714 output = PyUnicode_EncodeFSDefault(path);
3715 Py_DECREF(path);
3716 if (!output) {
3717 return 0;
3718 }
3719 assert(PyBytes_Check(output));
3720 }
3721
Victor Stinner0ea2a462010-04-30 00:22:08 +00003722 size = PyBytes_GET_SIZE(output);
3723 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003724 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003725 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003726 Py_DECREF(output);
3727 return 0;
3728 }
3729 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003730 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003731}
3732
3733
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003734int
3735PyUnicode_FSDecoder(PyObject* arg, void* addr)
3736{
Brett Cannona5711202016-09-06 19:36:01 -07003737 int is_buffer = 0;
3738 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003739 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003740 if (arg == NULL) {
3741 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003742 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003743 return 1;
3744 }
Brett Cannona5711202016-09-06 19:36:01 -07003745
3746 is_buffer = PyObject_CheckBuffer(arg);
3747 if (!is_buffer) {
3748 path = PyOS_FSPath(arg);
3749 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003750 return 0;
3751 }
Brett Cannona5711202016-09-06 19:36:01 -07003752 }
3753 else {
3754 path = arg;
3755 Py_INCREF(arg);
3756 }
3757
3758 if (PyUnicode_Check(path)) {
3759 if (PyUnicode_READY(path) == -1) {
3760 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003761 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003762 }
3763 output = path;
3764 }
3765 else if (PyBytes_Check(path) || is_buffer) {
3766 PyObject *path_bytes = NULL;
3767
3768 if (!PyBytes_Check(path) &&
3769 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3770 "path should be string, bytes, or os.PathLike, not %.200s",
3771 Py_TYPE(arg)->tp_name)) {
3772 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003773 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003774 }
3775 path_bytes = PyBytes_FromObject(path);
3776 Py_DECREF(path);
3777 if (!path_bytes) {
3778 return 0;
3779 }
3780 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3781 PyBytes_GET_SIZE(path_bytes));
3782 Py_DECREF(path_bytes);
3783 if (!output) {
3784 return 0;
3785 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003786 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003787 else {
3788 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003789 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003790 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003791 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003792 return 0;
3793 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003794 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003795 Py_DECREF(output);
3796 return 0;
3797 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003799 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003800 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003801 Py_DECREF(output);
3802 return 0;
3803 }
3804 *(PyObject**)addr = output;
3805 return Py_CLEANUP_SUPPORTED;
3806}
3807
3808
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003809const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003811{
Christian Heimesf3863112007-11-22 07:46:41 +00003812 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003814 if (!PyUnicode_Check(unicode)) {
3815 PyErr_BadArgument();
3816 return NULL;
3817 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003819 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003821 if (PyUnicode_UTF8(unicode) == NULL) {
3822 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003823 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 if (bytes == NULL)
3825 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3827 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003828 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 Py_DECREF(bytes);
3830 return NULL;
3831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003833 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 PyBytes_AS_STRING(bytes),
3835 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 Py_DECREF(bytes);
3837 }
3838
3839 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003840 *psize = PyUnicode_UTF8_LENGTH(unicode);
3841 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003842}
3843
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003844const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3848}
3849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850Py_UNICODE *
3851PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 const unsigned char *one_byte;
3854#if SIZEOF_WCHAR_T == 4
3855 const Py_UCS2 *two_bytes;
3856#else
3857 const Py_UCS4 *four_bytes;
3858 const Py_UCS4 *ucs4_end;
3859 Py_ssize_t num_surrogates;
3860#endif
3861 wchar_t *w;
3862 wchar_t *wchar_end;
3863
3864 if (!PyUnicode_Check(unicode)) {
3865 PyErr_BadArgument();
3866 return NULL;
3867 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003868 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003870 assert(_PyUnicode_KIND(unicode) != 0);
3871 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003873 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003875 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3876 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 num_surrogates = 0;
3878
3879 for (; four_bytes < ucs4_end; ++four_bytes) {
3880 if (*four_bytes > 0xFFFF)
3881 ++num_surrogates;
3882 }
3883
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3885 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3886 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 PyErr_NoMemory();
3888 return NULL;
3889 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003892 w = _PyUnicode_WSTR(unicode);
3893 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3894 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3896 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003897 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003899 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3900 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 }
3902 else
3903 *w = *four_bytes;
3904
3905 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07003906 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 }
3908 }
3909 *w = 0;
3910#else
3911 /* sizeof(wchar_t) == 4 */
3912 Py_FatalError("Impossible unicode object state, wstr and str "
3913 "should share memory already.");
3914 return NULL;
3915#endif
3916 }
3917 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003918 if ((size_t)_PyUnicode_LENGTH(unicode) >
3919 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3920 PyErr_NoMemory();
3921 return NULL;
3922 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003923 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3924 (_PyUnicode_LENGTH(unicode) + 1));
3925 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 PyErr_NoMemory();
3927 return NULL;
3928 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003929 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3930 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3931 w = _PyUnicode_WSTR(unicode);
3932 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003934 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3935 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936 for (; w < wchar_end; ++one_byte, ++w)
3937 *w = *one_byte;
3938 /* null-terminate the wstr */
3939 *w = 0;
3940 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003943 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 for (; w < wchar_end; ++two_bytes, ++w)
3945 *w = *two_bytes;
3946 /* null-terminate the wstr */
3947 *w = 0;
3948#else
3949 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 PyObject_FREE(_PyUnicode_WSTR(unicode));
3951 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 Py_FatalError("Impossible unicode object state, wstr "
3953 "and str should share memory already.");
3954 return NULL;
3955#endif
3956 }
3957 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07003958 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 }
3960 }
3961 }
3962 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003963 *size = PyUnicode_WSTR_LENGTH(unicode);
3964 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003965}
3966
Alexander Belopolsky40018472011-02-26 01:02:56 +00003967Py_UNICODE *
3968PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971}
3972
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03003973const Py_UNICODE *
3974_PyUnicode_AsUnicode(PyObject *unicode)
3975{
3976 Py_ssize_t size;
3977 const Py_UNICODE *wstr;
3978
3979 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3980 if (wstr && wcslen(wstr) != (size_t)size) {
3981 PyErr_SetString(PyExc_ValueError, "embedded null character");
3982 return NULL;
3983 }
3984 return wstr;
3985}
3986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987
Alexander Belopolsky40018472011-02-26 01:02:56 +00003988Py_ssize_t
3989PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990{
3991 if (!PyUnicode_Check(unicode)) {
3992 PyErr_BadArgument();
3993 goto onError;
3994 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003995 if (_PyUnicode_WSTR(unicode) == NULL) {
3996 if (PyUnicode_AsUnicode(unicode) == NULL)
3997 goto onError;
3998 }
3999 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000
Benjamin Peterson29060642009-01-31 22:14:21 +00004001 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 return -1;
4003}
4004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005Py_ssize_t
4006PyUnicode_GetLength(PyObject *unicode)
4007{
Victor Stinner07621332012-06-16 04:53:46 +02004008 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 PyErr_BadArgument();
4010 return -1;
4011 }
Victor Stinner07621332012-06-16 04:53:46 +02004012 if (PyUnicode_READY(unicode) == -1)
4013 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 return PyUnicode_GET_LENGTH(unicode);
4015}
4016
4017Py_UCS4
4018PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4019{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004020 void *data;
4021 int kind;
4022
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004023 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004024 PyErr_BadArgument();
4025 return (Py_UCS4)-1;
4026 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004027 if (PyUnicode_READY(unicode) == -1) {
4028 return (Py_UCS4)-1;
4029 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004030 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004031 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 return (Py_UCS4)-1;
4033 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004034 data = PyUnicode_DATA(unicode);
4035 kind = PyUnicode_KIND(unicode);
4036 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037}
4038
4039int
4040PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4041{
4042 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004043 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 return -1;
4045 }
Victor Stinner488fa492011-12-12 00:01:39 +01004046 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004047 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004048 PyErr_SetString(PyExc_IndexError, "string index out of range");
4049 return -1;
4050 }
Victor Stinner488fa492011-12-12 00:01:39 +01004051 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004052 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004053 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4054 PyErr_SetString(PyExc_ValueError, "character out of range");
4055 return -1;
4056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4058 index, ch);
4059 return 0;
4060}
4061
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4067
Victor Stinner554f3f02010-06-16 23:33:54 +00004068/* create or adjust a UnicodeDecodeError */
4069static void
4070make_decode_exception(PyObject **exceptionObject,
4071 const char *encoding,
4072 const char *input, Py_ssize_t length,
4073 Py_ssize_t startpos, Py_ssize_t endpos,
4074 const char *reason)
4075{
4076 if (*exceptionObject == NULL) {
4077 *exceptionObject = PyUnicodeDecodeError_Create(
4078 encoding, input, length, startpos, endpos, reason);
4079 }
4080 else {
4081 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4082 goto onError;
4083 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4084 goto onError;
4085 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4086 goto onError;
4087 }
4088 return;
4089
4090onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004091 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004092}
4093
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler:  error handler name and cached callable; the
                          callable is looked up on first use.
   encoding, reason:      used to build/update the UnicodeDecodeError.
   input, inend:          current input buffer bounds; refreshed from the
                          exception object because the callback may have
                          replaced it.
   startinpos, endinpos:  bounds of the offending input; *endinpos is set
                          to the position returned by the callback.
   exceptionObject:       cached exception, created on first use.
   inptr:                 set to the input position to resume from.
   output, outpos:        wchar-kind output string (resized if needed)
                          and the write position inside it.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The first three characters are the PyArg_ParseTuple format; the
       text after ';' doubles as the TypeError message (&argparse[3]). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is both safe and sufficient. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  memcpy is used rather than
       wcsncpy because wcsncpy stops at the first NUL and zero-fills the
       remainder, which would corrupt a replacement string that contains
       an embedded NUL character. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004206
4207static int
4208unicode_decode_call_errorhandler_writer(
4209 const char *errors, PyObject **errorHandler,
4210 const char *encoding, const char *reason,
4211 const char **input, const char **inend, Py_ssize_t *startinpos,
4212 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4213 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4214{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004215 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004216
4217 PyObject *restuple = NULL;
4218 PyObject *repunicode = NULL;
4219 Py_ssize_t insize;
4220 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004221 Py_ssize_t replen;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004222 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004223 PyObject *inputobj = NULL;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004224 int need_to_grow = 0;
4225 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004226
4227 if (*errorHandler == NULL) {
4228 *errorHandler = PyCodec_LookupError(errors);
4229 if (*errorHandler == NULL)
4230 goto onError;
4231 }
4232
4233 make_decode_exception(exceptionObject,
4234 encoding,
4235 *input, *inend - *input,
4236 *startinpos, *endinpos,
4237 reason);
4238 if (*exceptionObject == NULL)
4239 goto onError;
4240
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004241 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 if (restuple == NULL)
4243 goto onError;
4244 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004245 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004246 goto onError;
4247 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004248 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004249 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004250
4251 /* Copy back the bytes variables, which might have been modified by the
4252 callback */
4253 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4254 if (!inputobj)
4255 goto onError;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004256 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004257 *input = PyBytes_AS_STRING(inputobj);
4258 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004259 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004260 /* we can DECREF safely, as the exception has another reference,
4261 so the object won't go away. */
4262 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004266 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004267 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004269 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270
Victor Stinner170ca6f2013-04-18 00:25:28 +02004271 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004272 if (replen > 1) {
4273 writer->min_length += replen - 1;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004274 need_to_grow = 1;
4275 }
4276 new_inptr = *input + newpos;
4277 if (*inend - new_inptr > remain) {
4278 /* We don't know the decoding algorithm here so we make the worst
4279 assumption that one byte decodes to one unicode character.
4280 If unfortunately one byte could decode to more unicode characters,
4281 the decoder may write out-of-bound then. Is it possible for the
4282 algorithms using this function? */
4283 writer->min_length += *inend - new_inptr - remain;
4284 need_to_grow = 1;
4285 }
4286 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004287 writer->overallocate = 1;
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08004288 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004289 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4290 goto onError;
4291 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004293 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 *endinpos = newpos;
Xiang Zhang86fdad02018-01-31 20:48:05 +08004296 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004299 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004300 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305}
4306
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307/* --- UTF-7 Codec -------------------------------------------------------- */
4308
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309/* See RFC2152 for details. We encode conservatively and decode liberally. */
4310
/* Base-64 helper macros for the UTF-7 codec.  Each one may evaluate
   its argument more than once, so pass side-effect free expressions. */

/* Non-zero if c is one of the 64 characters of the base-64 alphabet. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* The 6-bit value of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* The base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: non-zero if a byte found in UTF-7 input decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)  \
    ((c) != '+' && (c) <= 127)
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is only looked up in utf7_category after the range test 0 < c < 128
 * has passed.  directO and directWS are parenthesized so that compound
 * expressions (e.g. "a || b") can be passed safely.  c may be evaluated
 * several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387
Alexander Belopolsky40018472011-02-26 01:02:56 +00004388PyObject *
4389PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004390 Py_ssize_t size,
4391 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004392{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004393 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4394}
4395
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396/* The decoder. The only state we preserve is our read position,
4397 * i.e. how many characters we have consumed. So if we end in the
4398 * middle of a shift sequence we have to back off the read position
4399 * and the output to the beginning of the sequence, otherwise we lose
4400 * all the shift state (seen bits, number of bits seen, high
4401 * surrogate). */
4402
Alexander Belopolsky40018472011-02-26 01:02:56 +00004403PyObject *
4404PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004405 Py_ssize_t size,
4406 const char *errors,
4407 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004408{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004410 Py_ssize_t startinpos;
4411 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 const char *errmsg = "";
4415 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004416 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 unsigned int base64bits = 0;
4418 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004419 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 PyObject *errorHandler = NULL;
4421 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004423 if (size == 0) {
4424 if (consumed)
4425 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004426 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004427 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004430 _PyUnicodeWriter_Init(&writer);
4431 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432
4433 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 e = s + size;
4435
4436 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004437 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004439 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004440
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441 if (inShift) { /* in a base-64 section */
4442 if (IS_BASE64(ch)) { /* consume a base-64 character */
4443 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4444 base64bits += 6;
4445 s++;
4446 if (base64bits >= 16) {
4447 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004448 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 base64bits -= 16;
4450 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004451 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 if (surrogate) {
4453 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004454 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4455 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004456 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004457 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004459 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 }
4461 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004462 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004463 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 }
4466 }
Victor Stinner551ac952011-11-29 22:58:13 +01004467 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 /* first surrogate */
4469 surrogate = outCh;
4470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004472 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 }
4475 }
4476 }
4477 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 if (base64bits > 0) { /* left-over bits */
4480 if (base64bits >= 6) {
4481 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004482 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004483 errmsg = "partial character in shift sequence";
4484 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004485 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004486 else {
4487 /* Some bits remain; they should be zero */
4488 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004489 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490 errmsg = "non-zero padding bits in shift sequence";
4491 goto utf7Error;
4492 }
4493 }
4494 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004495 if (surrogate && DECODE_DIRECT(ch)) {
4496 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4497 goto onError;
4498 }
4499 surrogate = 0;
4500 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 /* '-' is absorbed; other terminating
4502 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004503 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004505 }
4506 }
4507 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 s++; /* consume '+' */
4510 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004512 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004513 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 }
4515 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004517 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004518 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004520 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521 }
4522 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004525 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004526 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528 else {
4529 startinpos = s-starts;
4530 s++;
4531 errmsg = "unexpected special character";
4532 goto utf7Error;
4533 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004537 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 errors, &errorHandler,
4539 "utf7", errmsg,
4540 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004541 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 }
4544
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 /* end of string */
4546
4547 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4548 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004549 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004550 if (surrogate ||
4551 (base64bits >= 6) ||
4552 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 errors, &errorHandler,
4556 "utf7", "unterminated shift sequence",
4557 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004558 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 goto onError;
4560 if (s < e)
4561 goto restart;
4562 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564
4565 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004566 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004568 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004569 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004570 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004571 writer.kind, writer.data, shiftOutStart);
4572 Py_XDECREF(errorHandler);
4573 Py_XDECREF(exc);
4574 _PyUnicodeWriter_Dealloc(&writer);
4575 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004576 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004577 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 }
4579 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004580 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004582 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 Py_XDECREF(errorHandler);
4585 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004586 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 Py_XDECREF(errorHandler);
4590 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004591 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592 return NULL;
4593}
4594
4595
Alexander Belopolsky40018472011-02-26 01:02:56 +00004596PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004597_PyUnicode_EncodeUTF7(PyObject *str,
4598 int base64SetO,
4599 int base64WhiteSpace,
4600 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004601{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004602 int kind;
4603 void *data;
4604 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004605 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004607 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 unsigned int base64bits = 0;
4609 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610 char * out;
4611 char * start;
4612
Benjamin Petersonbac79492012-01-14 13:34:47 -05004613 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004614 return NULL;
4615 kind = PyUnicode_KIND(str);
4616 data = PyUnicode_DATA(str);
4617 len = PyUnicode_GET_LENGTH(str);
4618
4619 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004622 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004623 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004624 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004625 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 if (v == NULL)
4627 return NULL;
4628
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004629 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004630 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004631 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 if (inShift) {
4634 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4635 /* shifting out */
4636 if (base64bits) { /* output remaining bits */
4637 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4638 base64buffer = 0;
4639 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 }
4641 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 /* Characters not in the BASE64 set implicitly unshift the sequence
4643 so no '-' is required, except if the character is itself a '-' */
4644 if (IS_BASE64(ch) || ch == '-') {
4645 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 *out++ = (char) ch;
4648 }
4649 else {
4650 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 else { /* not in a shift sequence */
4654 if (ch == '+') {
4655 *out++ = '+';
4656 *out++ = '-';
4657 }
4658 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4659 *out++ = (char) ch;
4660 }
4661 else {
4662 *out++ = '+';
4663 inShift = 1;
4664 goto encode_char;
4665 }
4666 }
4667 continue;
4668encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004670 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004671
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 /* code first surrogate */
4673 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004674 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 while (base64bits >= 6) {
4676 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4677 base64bits -= 6;
4678 }
4679 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004680 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 base64bits += 16;
4683 base64buffer = (base64buffer << 16) | ch;
4684 while (base64bits >= 6) {
4685 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4686 base64bits -= 6;
4687 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004688 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689 if (base64bits)
4690 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4691 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004692 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004693 if (_PyBytes_Resize(&v, out - start) < 0)
4694 return NULL;
4695 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004697PyObject *
4698PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4699 Py_ssize_t size,
4700 int base64SetO,
4701 int base64WhiteSpace,
4702 const char *errors)
4703{
4704 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004705 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004706 if (tmp == NULL)
4707 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004708 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004709 base64WhiteSpace, errors);
4710 Py_DECREF(tmp);
4711 return result;
4712}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714#undef IS_BASE64
4715#undef FROM_BASE64
4716#undef TO_BASE64
4717#undef DECODE_DIRECT
4718#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720/* --- UTF-8 Codec -------------------------------------------------------- */
4721
Alexander Belopolsky40018472011-02-26 01:02:56 +00004722PyObject *
4723PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004724 Py_ssize_t size,
4725 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726{
Walter Dörwald69652032004-09-07 20:24:22 +00004727 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4728}
4729
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730#include "stringlib/asciilib.h"
4731#include "stringlib/codecs.h"
4732#include "stringlib/undef.h"
4733
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004734#include "stringlib/ucs1lib.h"
4735#include "stringlib/codecs.h"
4736#include "stringlib/undef.h"
4737
4738#include "stringlib/ucs2lib.h"
4739#include "stringlib/codecs.h"
4740#include "stringlib/undef.h"
4741
4742#include "stringlib/ucs4lib.h"
4743#include "stringlib/codecs.h"
4744#include "stringlib/undef.h"
4745
Antoine Pitrouab868312009-01-10 15:40:25 +00004746/* Mask to quickly check whether a C 'long' contains a
4747 non-ASCII, UTF8-encoded char. */
4748#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004749# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004750#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004751# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004752#else
4753# error C 'long' size should be either 4 or 8!
4754#endif
4755
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756static Py_ssize_t
4757ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004760 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004761
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004762 /*
4763 * Issue #17237: m68k is a bit different from most architectures in
4764 * that objects do not use "natural alignment" - for example, int and
4765 * long are only aligned at 2-byte boundaries. Therefore the assert()
4766 * won't work; also, tests have shown that skipping the "optimised
4767 * version" will even speed up m68k.
4768 */
4769#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004771 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4772 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004773 /* Fast path, see in STRINGLIB(utf8_decode) for
4774 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004775 /* Help allocation */
4776 const char *_p = p;
4777 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778 while (_p < aligned_end) {
4779 unsigned long value = *(const unsigned long *) _p;
4780 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 *((unsigned long *)q) = value;
4783 _p += SIZEOF_LONG;
4784 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004785 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786 p = _p;
4787 while (p < end) {
4788 if ((unsigned char)*p & 0x80)
4789 break;
4790 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004792 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004795#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004796 while (p < end) {
4797 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4798 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004799 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004800 /* Help allocation */
4801 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802 while (_p < aligned_end) {
4803 unsigned long value = *(unsigned long *) _p;
4804 if (value & ASCII_CHAR_MASK)
4805 break;
4806 _p += SIZEOF_LONG;
4807 }
4808 p = _p;
4809 if (_p == end)
4810 break;
4811 }
4812 if ((unsigned char)*p & 0x80)
4813 break;
4814 ++p;
4815 }
4816 memcpy(dest, start, p - start);
4817 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818}
Antoine Pitrouab868312009-01-10 15:40:25 +00004819
Victor Stinner785938e2011-12-11 20:09:03 +01004820PyObject *
4821PyUnicode_DecodeUTF8Stateful(const char *s,
4822 Py_ssize_t size,
4823 const char *errors,
4824 Py_ssize_t *consumed)
4825{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004826 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004827 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004829
4830 Py_ssize_t startinpos;
4831 Py_ssize_t endinpos;
4832 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004833 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004834 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004835 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004836
4837 if (size == 0) {
4838 if (consumed)
4839 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004840 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004841 }
4842
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004843 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4844 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004845 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004846 *consumed = 1;
4847 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004848 }
4849
Victor Stinner8f674cc2013-04-17 23:02:17 +02004850 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004851 writer.min_length = size;
4852 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004853 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004854
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004855 writer.pos = ascii_decode(s, end, writer.data);
4856 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 while (s < end) {
4858 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004859 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004860
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004861 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004862 if (PyUnicode_IS_ASCII(writer.buffer))
4863 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004865 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004867 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004868 } else {
4869 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004870 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 }
4872
4873 switch (ch) {
4874 case 0:
4875 if (s == end || consumed)
4876 goto End;
4877 errmsg = "unexpected end of data";
4878 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004879 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 break;
4881 case 1:
4882 errmsg = "invalid start byte";
4883 startinpos = s - starts;
4884 endinpos = startinpos + 1;
4885 break;
4886 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004887 case 3:
4888 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889 errmsg = "invalid continuation byte";
4890 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004891 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 break;
4893 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004894 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 goto onError;
4896 continue;
4897 }
4898
Victor Stinner1d65d912015-10-05 13:43:50 +02004899 if (error_handler == _Py_ERROR_UNKNOWN)
4900 error_handler = get_error_handler(errors);
4901
4902 switch (error_handler) {
4903 case _Py_ERROR_IGNORE:
4904 s += (endinpos - startinpos);
4905 break;
4906
4907 case _Py_ERROR_REPLACE:
4908 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4909 goto onError;
4910 s += (endinpos - startinpos);
4911 break;
4912
4913 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004914 {
4915 Py_ssize_t i;
4916
Victor Stinner1d65d912015-10-05 13:43:50 +02004917 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4918 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004919 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004920 ch = (Py_UCS4)(unsigned char)(starts[i]);
4921 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4922 ch + 0xdc00);
4923 writer.pos++;
4924 }
4925 s += (endinpos - startinpos);
4926 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004927 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004928
4929 default:
4930 if (unicode_decode_call_errorhandler_writer(
4931 errors, &error_handler_obj,
4932 "utf-8", errmsg,
4933 &starts, &end, &startinpos, &endinpos, &exc, &s,
4934 &writer))
4935 goto onError;
4936 }
Victor Stinner785938e2011-12-11 20:09:03 +01004937 }
4938
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 if (consumed)
4941 *consumed = s - starts;
4942
Victor Stinner1d65d912015-10-05 13:43:50 +02004943 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004945 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946
4947onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004948 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004950 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004952}
4953
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004954
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004955/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4956 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004957
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004958 On success, write a pointer to a newly allocated wide character string into
4959 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4960 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004961
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004962 On memory allocation failure, return -1.
4963
4964 On decoding error (if surrogateescape is zero), return -2. If wlen is
4965 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4966 is not NULL, write the decoding error message into *reason. */
4967int
4968_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4969 const char **reason, int surrogateescape)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004970{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004971 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004972 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 wchar_t *unicode;
4974 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004975
4976 /* Note: size will always be longer than the resulting Unicode
4977 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01004978 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004979 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004980 }
4981
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004982 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01004983 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004984 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01004985 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004986
4987 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004990 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004992#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 if (ch > 0xFF) {
4998#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07004999 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005001 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005002 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5004 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5005#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005006 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 else {
5008 if (!ch && s == e)
5009 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005010 if (!surrogateescape) {
5011 PyMem_RawFree(unicode );
5012 if (reason != NULL) {
5013 switch (ch) {
5014 case 0:
5015 *reason = "unexpected end of data";
5016 break;
5017 case 1:
5018 *reason = "invalid start byte";
5019 break;
5020 /* 2, 3, 4 */
5021 default:
5022 *reason = "invalid continuation byte";
5023 break;
5024 }
5025 }
5026 if (wlen != NULL) {
5027 *wlen = s - orig_s;
5028 }
5029 return -2;
5030 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005031 /* surrogateescape */
5032 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5033 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005034 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005035 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005036 if (wlen) {
5037 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005038 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005039 *wstr = unicode;
5040 return 0;
5041}
5042
5043wchar_t*
5044_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5045{
5046 wchar_t *wstr;
5047 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5048 if (res != 0) {
5049 return NULL;
5050 }
5051 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005052}
5053
Antoine Pitrouab868312009-01-10 15:40:25 +00005054
Victor Stinnere47e6982017-12-21 15:45:16 +01005055/* UTF-8 encoder using the surrogateescape error handler .
5056
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005057 On success, return 0 and write the newly allocated character string (use
5058 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005059
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005060 On encoding failure, return -2 and write the position of the invalid
5061 surrogate character into *error_pos (if error_pos is set) and the decoding
5062 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005063
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005064 On memory allocation failure, return -1. */
5065int
5066_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5067 const char **reason, int raw_malloc, int surrogateescape)
Victor Stinnere47e6982017-12-21 15:45:16 +01005068{
5069 const Py_ssize_t max_char_size = 4;
5070 Py_ssize_t len = wcslen(text);
5071
5072 assert(len >= 0);
5073
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005074 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5075 return -1;
5076 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005077 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005078 if (raw_malloc) {
5079 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005080 }
5081 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005082 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005083 }
5084 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005085 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005086 }
5087
5088 char *p = bytes;
5089 Py_ssize_t i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005090 for (i = 0; i < len; i++) {
5091 Py_UCS4 ch = text[i];
Victor Stinnere47e6982017-12-21 15:45:16 +01005092
5093 if (ch < 0x80) {
5094 /* Encode ASCII */
5095 *p++ = (char) ch;
5096
5097 }
5098 else if (ch < 0x0800) {
5099 /* Encode Latin-1 */
5100 *p++ = (char)(0xc0 | (ch >> 6));
5101 *p++ = (char)(0x80 | (ch & 0x3f));
5102 }
5103 else if (Py_UNICODE_IS_SURROGATE(ch)) {
5104 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005105 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005106 if (error_pos != NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005107 *error_pos = (size_t)i;
Victor Stinnere47e6982017-12-21 15:45:16 +01005108 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005109 if (reason != NULL) {
5110 *reason = "encoding error";
5111 }
5112 if (raw_malloc) {
5113 PyMem_RawFree(bytes);
5114 }
5115 else {
5116 PyMem_Free(bytes);
5117 }
5118 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005119 }
5120 *p++ = (char)(ch & 0xff);
5121 }
5122 else if (ch < 0x10000) {
5123 *p++ = (char)(0xe0 | (ch >> 12));
5124 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5125 *p++ = (char)(0x80 | (ch & 0x3f));
5126 }
5127 else { /* ch >= 0x10000 */
5128 assert(ch <= MAX_UNICODE);
5129 /* Encode UCS4 Unicode ordinals */
5130 *p++ = (char)(0xf0 | (ch >> 18));
5131 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5132 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5133 *p++ = (char)(0x80 | (ch & 0x3f));
5134 }
5135 }
5136 *p++ = '\0';
5137
5138 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005139 char *bytes2;
5140 if (raw_malloc) {
5141 bytes2 = PyMem_RawRealloc(bytes, final_size);
5142 }
5143 else {
5144 bytes2 = PyMem_Realloc(bytes, final_size);
5145 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005146 if (bytes2 == NULL) {
5147 if (error_pos != NULL) {
5148 *error_pos = (size_t)-1;
5149 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005150 if (raw_malloc) {
5151 PyMem_RawFree(bytes);
5152 }
5153 else {
5154 PyMem_Free(bytes);
5155 }
5156 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005157 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005158 *str = bytes2;
5159 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005160}
5161
5162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005163/* Primary internal function which creates utf8 encoded bytes objects.
5164
5165 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005166 and allocate exactly as much space needed at the end. Else allocate the
5167 maximum possible needed (4 result bytes per Unicode character), and return
5168 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005169*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005170PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005171_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172{
Victor Stinner6099a032011-12-18 14:22:26 +01005173 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005174 void *data;
5175 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 if (!PyUnicode_Check(unicode)) {
5178 PyErr_BadArgument();
5179 return NULL;
5180 }
5181
5182 if (PyUnicode_READY(unicode) == -1)
5183 return NULL;
5184
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005185 if (PyUnicode_UTF8(unicode))
5186 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5187 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005188
5189 kind = PyUnicode_KIND(unicode);
5190 data = PyUnicode_DATA(unicode);
5191 size = PyUnicode_GET_LENGTH(unicode);
5192
Benjamin Petersonead6b532011-12-20 17:23:42 -06005193 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005194 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005195 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005196 case PyUnicode_1BYTE_KIND:
5197 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5198 assert(!PyUnicode_IS_ASCII(unicode));
5199 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5200 case PyUnicode_2BYTE_KIND:
5201 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5202 case PyUnicode_4BYTE_KIND:
5203 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205}
5206
Alexander Belopolsky40018472011-02-26 01:02:56 +00005207PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005208PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5209 Py_ssize_t size,
5210 const char *errors)
5211{
5212 PyObject *v, *unicode;
5213
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005214 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005215 if (unicode == NULL)
5216 return NULL;
5217 v = _PyUnicode_AsUTF8String(unicode, errors);
5218 Py_DECREF(unicode);
5219 return v;
5220}
5221
5222PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005223PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005225 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226}
5227
Walter Dörwald41980ca2007-08-16 21:55:45 +00005228/* --- UTF-32 Codec ------------------------------------------------------- */
5229
5230PyObject *
5231PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235{
5236 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5237}
5238
5239PyObject *
5240PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 Py_ssize_t size,
5242 const char *errors,
5243 int *byteorder,
5244 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245{
5246 const char *starts = s;
5247 Py_ssize_t startinpos;
5248 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005249 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005250 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005251 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005252 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005253 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254 PyObject *errorHandler = NULL;
5255 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005256
Walter Dörwald41980ca2007-08-16 21:55:45 +00005257 q = (unsigned char *)s;
5258 e = q + size;
5259
5260 if (byteorder)
5261 bo = *byteorder;
5262
5263 /* Check for BOM marks (U+FEFF) in the input and adjust current
5264 byte order setting accordingly. In native mode, the leading BOM
5265 mark is skipped, in all other modes, it is copied to the output
5266 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005267 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005268 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005269 if (bom == 0x0000FEFF) {
5270 bo = -1;
5271 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005273 else if (bom == 0xFFFE0000) {
5274 bo = 1;
5275 q += 4;
5276 }
5277 if (byteorder)
5278 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005279 }
5280
Victor Stinnere64322e2012-10-30 23:12:47 +01005281 if (q == e) {
5282 if (consumed)
5283 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005284 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005285 }
5286
Victor Stinnere64322e2012-10-30 23:12:47 +01005287#ifdef WORDS_BIGENDIAN
5288 le = bo < 0;
5289#else
5290 le = bo <= 0;
5291#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005292 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005293
Victor Stinner8f674cc2013-04-17 23:02:17 +02005294 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005295 writer.min_length = (e - q + 3) / 4;
5296 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005297 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005298
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 while (1) {
5300 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005301 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005302
Victor Stinnere64322e2012-10-30 23:12:47 +01005303 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 enum PyUnicode_Kind kind = writer.kind;
5305 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005306 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 if (le) {
5309 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005310 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005311 if (ch > maxch)
5312 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005313 if (kind != PyUnicode_1BYTE_KIND &&
5314 Py_UNICODE_IS_SURROGATE(ch))
5315 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005316 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005317 q += 4;
5318 } while (q <= last);
5319 }
5320 else {
5321 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005322 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005323 if (ch > maxch)
5324 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 if (kind != PyUnicode_1BYTE_KIND &&
5326 Py_UNICODE_IS_SURROGATE(ch))
5327 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005328 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005329 q += 4;
5330 } while (q <= last);
5331 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005332 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005333 }
5334
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005335 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005336 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005337 startinpos = ((const char *)q) - starts;
5338 endinpos = startinpos + 4;
5339 }
5340 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005341 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005343 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005345 startinpos = ((const char *)q) - starts;
5346 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005348 else {
5349 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005350 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005351 goto onError;
5352 q += 4;
5353 continue;
5354 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005355 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005356 startinpos = ((const char *)q) - starts;
5357 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005359
5360 /* The remaining input chars are ignored if the callback
5361 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005362 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005364 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005368 }
5369
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005372
Walter Dörwald41980ca2007-08-16 21:55:45 +00005373 Py_XDECREF(errorHandler);
5374 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005375 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005376
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005378 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379 Py_XDECREF(errorHandler);
5380 Py_XDECREF(exc);
5381 return NULL;
5382}
5383
5384PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005385_PyUnicode_EncodeUTF32(PyObject *str,
5386 const char *errors,
5387 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005388{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005389 enum PyUnicode_Kind kind;
5390 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005391 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005392 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005393 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005394#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005397 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005398#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005399 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005400 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005401 PyObject *errorHandler = NULL;
5402 PyObject *exc = NULL;
5403 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005405 if (!PyUnicode_Check(str)) {
5406 PyErr_BadArgument();
5407 return NULL;
5408 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005409 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005410 return NULL;
5411 kind = PyUnicode_KIND(str);
5412 data = PyUnicode_DATA(str);
5413 len = PyUnicode_GET_LENGTH(str);
5414
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005415 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005416 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005418 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419 if (v == NULL)
5420 return NULL;
5421
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005422 /* output buffer is 4-bytes aligned */
5423 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005424 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005425 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005426 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005427 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005428 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005429
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005430 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005432 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005433 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 else
5435 encoding = "utf-32";
5436
5437 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005438 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5439 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005440 }
5441
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005442 pos = 0;
5443 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005445
5446 if (kind == PyUnicode_2BYTE_KIND) {
5447 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5448 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005450 else {
5451 assert(kind == PyUnicode_4BYTE_KIND);
5452 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5453 &out, native_ordering);
5454 }
5455 if (pos == len)
5456 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005457
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005458 rep = unicode_encode_call_errorhandler(
5459 errors, &errorHandler,
5460 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005461 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005462 if (!rep)
5463 goto error;
5464
5465 if (PyBytes_Check(rep)) {
5466 repsize = PyBytes_GET_SIZE(rep);
5467 if (repsize & 3) {
5468 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005469 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 "surrogates not allowed");
5471 goto error;
5472 }
5473 moreunits = repsize / 4;
5474 }
5475 else {
5476 assert(PyUnicode_Check(rep));
5477 if (PyUnicode_READY(rep) < 0)
5478 goto error;
5479 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5480 if (!PyUnicode_IS_ASCII(rep)) {
5481 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005482 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 "surrogates not allowed");
5484 goto error;
5485 }
5486 }
5487
5488 /* four bytes are reserved for each surrogate */
5489 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005490 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005491 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005492 /* integer overflow */
5493 PyErr_NoMemory();
5494 goto error;
5495 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005496 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005498 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 }
5500
5501 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005502 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005503 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005504 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005505 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005506 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5507 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 }
5509
5510 Py_CLEAR(rep);
5511 }
5512
5513 /* Cut back to size actually needed. This is necessary for, for example,
5514 encoding of a string containing isolated surrogates and the 'ignore'
5515 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005516 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005517 if (nsize != PyBytes_GET_SIZE(v))
5518 _PyBytes_Resize(&v, nsize);
5519 Py_XDECREF(errorHandler);
5520 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005521 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005522 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005523 error:
5524 Py_XDECREF(rep);
5525 Py_XDECREF(errorHandler);
5526 Py_XDECREF(exc);
5527 Py_XDECREF(v);
5528 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005529}
5530
Alexander Belopolsky40018472011-02-26 01:02:56 +00005531PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005532PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5533 Py_ssize_t size,
5534 const char *errors,
5535 int byteorder)
5536{
5537 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005538 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005539 if (tmp == NULL)
5540 return NULL;
5541 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5542 Py_DECREF(tmp);
5543 return result;
5544}
5545
5546PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005547PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548{
Victor Stinnerb960b342011-11-20 19:12:52 +01005549 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550}
5551
/* --- UTF-16 Codec ------------------------------------------------------- */
5553
Tim Peters772747b2001-08-09 22:21:55 +00005554PyObject *
5555PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 Py_ssize_t size,
5557 const char *errors,
5558 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559{
Walter Dörwald69652032004-09-07 20:24:22 +00005560 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5561}
5562
5563PyObject *
5564PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 Py_ssize_t size,
5566 const char *errors,
5567 int *byteorder,
5568 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005571 Py_ssize_t startinpos;
5572 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005575 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005576 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005577 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 PyObject *errorHandler = NULL;
5579 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581
Tim Peters772747b2001-08-09 22:21:55 +00005582 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005583 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584
5585 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005586 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005588 /* Check for BOM marks (U+FEFF) in the input and adjust current
5589 byte order setting accordingly. In native mode, the leading BOM
5590 mark is skipped, in all other modes, it is copied to the output
5591 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005592 if (bo == 0 && size >= 2) {
5593 const Py_UCS4 bom = (q[1] << 8) | q[0];
5594 if (bom == 0xFEFF) {
5595 q += 2;
5596 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005598 else if (bom == 0xFFFE) {
5599 q += 2;
5600 bo = 1;
5601 }
5602 if (byteorder)
5603 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605
Antoine Pitrou63065d72012-05-15 23:48:04 +02005606 if (q == e) {
5607 if (consumed)
5608 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005609 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005610 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611
Christian Heimes743e0cd2012-10-17 23:52:17 +02005612#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005613 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005614 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005615#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005616 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005617 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005618#endif
Tim Peters772747b2001-08-09 22:21:55 +00005619
Antoine Pitrou63065d72012-05-15 23:48:04 +02005620 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang86fdad02018-01-31 20:48:05 +08005621 character count normally. Error handler will take care of
5622 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005623 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005624 writer.min_length = (e - q + 1) / 2;
5625 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005626 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005627
Antoine Pitrou63065d72012-05-15 23:48:04 +02005628 while (1) {
5629 Py_UCS4 ch = 0;
5630 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005632 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005636 native_ordering);
5637 else
5638 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005640 native_ordering);
5641 } else if (kind == PyUnicode_2BYTE_KIND) {
5642 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005643 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005644 native_ordering);
5645 } else {
5646 assert(kind == PyUnicode_4BYTE_KIND);
5647 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005648 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005649 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005650 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005651 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652
Antoine Pitrou63065d72012-05-15 23:48:04 +02005653 switch (ch)
5654 {
5655 case 0:
5656 /* remaining byte at the end? (size should be even) */
5657 if (q == e || consumed)
5658 goto End;
5659 errmsg = "truncated data";
5660 startinpos = ((const char *)q) - starts;
5661 endinpos = ((const char *)e) - starts;
5662 break;
5663 /* The remaining input chars are ignored if the callback
5664 chooses to skip the input */
5665 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005666 q -= 2;
5667 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005668 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005669 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005670 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005671 endinpos = ((const char *)e) - starts;
5672 break;
5673 case 2:
5674 errmsg = "illegal encoding";
5675 startinpos = ((const char *)q) - 2 - starts;
5676 endinpos = startinpos + 2;
5677 break;
5678 case 3:
5679 errmsg = "illegal UTF-16 surrogate";
5680 startinpos = ((const char *)q) - 4 - starts;
5681 endinpos = startinpos + 2;
5682 break;
5683 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005684 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 continue;
5687 }
5688
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005690 errors,
5691 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005692 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005693 &starts,
5694 (const char **)&e,
5695 &startinpos,
5696 &endinpos,
5697 &exc,
5698 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 }
5702
Antoine Pitrou63065d72012-05-15 23:48:04 +02005703End:
Walter Dörwald69652032004-09-07 20:24:22 +00005704 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 Py_XDECREF(errorHandler);
5708 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005709 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005712 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 Py_XDECREF(errorHandler);
5714 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 return NULL;
5716}
5717
Tim Peters772747b2001-08-09 22:21:55 +00005718PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719_PyUnicode_EncodeUTF16(PyObject *str,
5720 const char *errors,
5721 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005723 enum PyUnicode_Kind kind;
5724 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005725 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005726 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005727 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005728 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005729#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005730 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005731#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005732 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005733#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005734 const char *encoding;
5735 Py_ssize_t nsize, pos;
5736 PyObject *errorHandler = NULL;
5737 PyObject *exc = NULL;
5738 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005739
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 if (!PyUnicode_Check(str)) {
5741 PyErr_BadArgument();
5742 return NULL;
5743 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005744 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005745 return NULL;
5746 kind = PyUnicode_KIND(str);
5747 data = PyUnicode_DATA(str);
5748 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005749
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005751 if (kind == PyUnicode_4BYTE_KIND) {
5752 const Py_UCS4 *in = (const Py_UCS4 *)data;
5753 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005754 while (in < end) {
5755 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005756 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
5758 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005759 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005760 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 nsize = len + pairs + (byteorder == 0);
5764 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005765 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005769 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005770 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005771 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005772 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005773 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 }
5775 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005776 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005777 }
Tim Peters772747b2001-08-09 22:21:55 +00005778
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 if (kind == PyUnicode_1BYTE_KIND) {
5780 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5781 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005782 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005783
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005784 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005785 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005786 }
5787 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005788 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005789 }
5790 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005791 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005792 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793
5794 pos = 0;
5795 while (pos < len) {
5796 Py_ssize_t repsize, moreunits;
5797
5798 if (kind == PyUnicode_2BYTE_KIND) {
5799 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5800 &out, native_ordering);
5801 }
5802 else {
5803 assert(kind == PyUnicode_4BYTE_KIND);
5804 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5805 &out, native_ordering);
5806 }
5807 if (pos == len)
5808 break;
5809
5810 rep = unicode_encode_call_errorhandler(
5811 errors, &errorHandler,
5812 encoding, "surrogates not allowed",
5813 str, &exc, pos, pos + 1, &pos);
5814 if (!rep)
5815 goto error;
5816
5817 if (PyBytes_Check(rep)) {
5818 repsize = PyBytes_GET_SIZE(rep);
5819 if (repsize & 1) {
5820 raise_encode_exception(&exc, encoding,
5821 str, pos - 1, pos,
5822 "surrogates not allowed");
5823 goto error;
5824 }
5825 moreunits = repsize / 2;
5826 }
5827 else {
5828 assert(PyUnicode_Check(rep));
5829 if (PyUnicode_READY(rep) < 0)
5830 goto error;
5831 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5832 if (!PyUnicode_IS_ASCII(rep)) {
5833 raise_encode_exception(&exc, encoding,
5834 str, pos - 1, pos,
5835 "surrogates not allowed");
5836 goto error;
5837 }
5838 }
5839
5840 /* two bytes are reserved for each surrogate */
5841 if (moreunits > 1) {
5842 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005843 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 /* integer overflow */
5845 PyErr_NoMemory();
5846 goto error;
5847 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005848 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005849 goto error;
5850 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5851 }
5852
5853 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005854 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 out += moreunits;
5856 } else /* rep is unicode */ {
5857 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5858 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5859 &out, native_ordering);
5860 }
5861
5862 Py_CLEAR(rep);
5863 }
5864
5865 /* Cut back to size actually needed. This is necessary for, for example,
5866 encoding of a string containing isolated surrogates and the 'ignore' handler
5867 is used. */
5868 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5869 if (nsize != PyBytes_GET_SIZE(v))
5870 _PyBytes_Resize(&v, nsize);
5871 Py_XDECREF(errorHandler);
5872 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005873 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005874 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005875 error:
5876 Py_XDECREF(rep);
5877 Py_XDECREF(errorHandler);
5878 Py_XDECREF(exc);
5879 Py_XDECREF(v);
5880 return NULL;
5881#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882}
5883
Alexander Belopolsky40018472011-02-26 01:02:56 +00005884PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5886 Py_ssize_t size,
5887 const char *errors,
5888 int byteorder)
5889{
5890 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005891 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 if (tmp == NULL)
5893 return NULL;
5894 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5895 Py_DECREF(tmp);
5896 return result;
5897}
5898
5899PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005900PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903}
5904
/* --- Unicode Escape Codec ----------------------------------------------- */
5906
Fredrik Lundh06d12682001-01-24 07:59:11 +00005907static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005908
Alexander Belopolsky40018472011-02-26 01:02:56 +00005909PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005910_PyUnicode_DecodeUnicodeEscape(const char *s,
5911 Py_ssize_t size,
5912 const char *errors,
5913 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005916 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 PyObject *errorHandler = NULL;
5919 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920
Eric V. Smith42454af2016-10-31 09:22:08 -04005921 // so we can remember if we've seen an invalid escape char or not
5922 *first_invalid_escape = NULL;
5923
Victor Stinner62ec3312016-09-06 17:04:34 -07005924 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005925 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005926 }
5927 /* Escaped strings will always be longer than the resulting
5928 Unicode string, so we start with size here and then reduce the
5929 length after conversion to the true value.
5930 (but if the error callback returns a long replacement string
5931 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005932 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005933 writer.min_length = size;
5934 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5935 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005936 }
5937
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 end = s + size;
5939 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005940 unsigned char c = (unsigned char) *s++;
5941 Py_UCS4 ch;
5942 int count;
5943 Py_ssize_t startinpos;
5944 Py_ssize_t endinpos;
5945 const char *message;
5946
5947#define WRITE_ASCII_CHAR(ch) \
5948 do { \
5949 assert(ch <= 127); \
5950 assert(writer.pos < writer.size); \
5951 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5952 } while(0)
5953
5954#define WRITE_CHAR(ch) \
5955 do { \
5956 if (ch <= writer.maxchar) { \
5957 assert(writer.pos < writer.size); \
5958 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5959 } \
5960 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5961 goto onError; \
5962 } \
5963 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
5965 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005966 if (c != '\\') {
5967 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 continue;
5969 }
5970
Victor Stinner62ec3312016-09-06 17:04:34 -07005971 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 if (s >= end) {
5974 message = "\\ at end of string";
5975 goto error;
5976 }
5977 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005978
Victor Stinner62ec3312016-09-06 17:04:34 -07005979 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005980 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005983 case '\n': continue;
5984 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5985 case '\'': WRITE_ASCII_CHAR('\''); continue;
5986 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5987 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005989 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5990 case 't': WRITE_ASCII_CHAR('\t'); continue;
5991 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5992 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005993 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005994 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005995 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005996 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 case '0': case '1': case '2': case '3':
6000 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006001 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006002 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006003 ch = (ch<<3) + *s++ - '0';
6004 if (s < end && '0' <= *s && *s <= '7') {
6005 ch = (ch<<3) + *s++ - '0';
6006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006008 WRITE_CHAR(ch);
6009 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 /* hex escapes */
6012 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006014 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006015 message = "truncated \\xXX escape";
6016 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006020 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006021 message = "truncated \\uXXXX escape";
6022 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006025 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006026 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006027 message = "truncated \\UXXXXXXXX escape";
6028 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006029 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006030 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006031 ch <<= 4;
6032 if (c >= '0' && c <= '9') {
6033 ch += c - '0';
6034 }
6035 else if (c >= 'a' && c <= 'f') {
6036 ch += c - ('a' - 10);
6037 }
6038 else if (c >= 'A' && c <= 'F') {
6039 ch += c - ('A' - 10);
6040 }
6041 else {
6042 break;
6043 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006044 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006045 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006046 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006047 }
6048
6049 /* when we get here, ch is a 32-bit unicode character */
6050 if (ch > MAX_UNICODE) {
6051 message = "illegal Unicode character";
6052 goto error;
6053 }
6054
6055 WRITE_CHAR(ch);
6056 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006057
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006059 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006060 if (ucnhash_CAPI == NULL) {
6061 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006062 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6063 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006064 if (ucnhash_CAPI == NULL) {
6065 PyErr_SetString(
6066 PyExc_UnicodeError,
6067 "\\N escapes not supported (can't load unicodedata module)"
6068 );
6069 goto onError;
6070 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006072
6073 message = "malformed \\N character escape";
Miss Islington (bot)9fbcb142018-11-13 16:39:36 -08006074 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 const char *start = ++s;
6076 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006077 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006079 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 namelen = s - start;
6081 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006083 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006084 ch = 0xffffffff; /* in case 'getcode' messes up */
6085 if (namelen <= INT_MAX &&
6086 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6087 &ch, 0)) {
6088 assert(ch <= MAX_UNICODE);
6089 WRITE_CHAR(ch);
6090 continue;
6091 }
6092 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006093 }
6094 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006095 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006096
6097 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006098 if (*first_invalid_escape == NULL) {
6099 *first_invalid_escape = s-1; /* Back up one char, since we've
6100 already incremented s. */
6101 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006102 WRITE_ASCII_CHAR('\\');
6103 WRITE_CHAR(c);
6104 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006106
6107 error:
6108 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006109 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006110 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006111 errors, &errorHandler,
6112 "unicodeescape", message,
6113 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006114 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006115 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006116 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006117 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006118
6119#undef WRITE_ASCII_CHAR
6120#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006122
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006123 Py_XDECREF(errorHandler);
6124 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006126
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 Py_XDECREF(errorHandler);
6130 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 return NULL;
6132}
6133
Eric V. Smith42454af2016-10-31 09:22:08 -04006134PyObject *
6135PyUnicode_DecodeUnicodeEscape(const char *s,
6136 Py_ssize_t size,
6137 const char *errors)
6138{
6139 const char *first_invalid_escape;
6140 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6141 &first_invalid_escape);
6142 if (result == NULL)
6143 return NULL;
6144 if (first_invalid_escape != NULL) {
6145 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6146 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006147 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006148 Py_DECREF(result);
6149 return NULL;
6150 }
6151 }
6152 return result;
6153}
6154
/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
Alexander Belopolsky40018472011-02-26 01:02:56 +00006157PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006161 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006165 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
Ezio Melottie7f90372012-10-05 03:33:31 +03006167 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006168 escape.
6169
Ezio Melottie7f90372012-10-05 03:33:31 +03006170 For UCS1 strings it's '\xxx', 4 bytes per source character.
6171 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6172 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006173 */
6174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 if (!PyUnicode_Check(unicode)) {
6176 PyErr_BadArgument();
6177 return NULL;
6178 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 }
Victor Stinner358af132015-10-12 22:36:57 +02006182
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006184 if (len == 0) {
6185 return PyBytes_FromStringAndSize(NULL, 0);
6186 }
6187
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 kind = PyUnicode_KIND(unicode);
6189 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6191 bytes, and 1 byte characters 4. */
6192 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006193 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006194 return PyErr_NoMemory();
6195 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006196 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 if (repr == NULL) {
6198 return NULL;
6199 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006200
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006203 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006204
Victor Stinner62ec3312016-09-06 17:04:34 -07006205 /* U+0000-U+00ff range */
6206 if (ch < 0x100) {
6207 if (ch >= ' ' && ch < 127) {
6208 if (ch != '\\') {
6209 /* Copy printable US ASCII as-is */
6210 *p++ = (char) ch;
6211 }
6212 /* Escape backslashes */
6213 else {
6214 *p++ = '\\';
6215 *p++ = '\\';
6216 }
6217 }
Victor Stinner358af132015-10-12 22:36:57 +02006218
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 /* Map special whitespace to '\t', \n', '\r' */
6220 else if (ch == '\t') {
6221 *p++ = '\\';
6222 *p++ = 't';
6223 }
6224 else if (ch == '\n') {
6225 *p++ = '\\';
6226 *p++ = 'n';
6227 }
6228 else if (ch == '\r') {
6229 *p++ = '\\';
6230 *p++ = 'r';
6231 }
6232
6233 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6234 else {
6235 *p++ = '\\';
6236 *p++ = 'x';
6237 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6238 *p++ = Py_hexdigits[ch & 0x000F];
6239 }
Tim Petersced69f82003-09-16 20:30:58 +00006240 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006241 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 *p++ = '\\';
6244 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006245 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6246 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6247 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6248 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6251 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006252
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 /* Make sure that the first two digits are zero */
6254 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006255 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 *p++ = 'U';
6257 *p++ = '0';
6258 *p++ = '0';
6259 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6260 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6261 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6262 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6263 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6264 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
Victor Stinner62ec3312016-09-06 17:04:34 -07006268 assert(p - PyBytes_AS_STRING(repr) > 0);
6269 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6270 return NULL;
6271 }
6272 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273}
6274
Alexander Belopolsky40018472011-02-26 01:02:56 +00006275PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006276PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6277 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006279 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006280 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006281 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 }
6284
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006285 result = PyUnicode_AsUnicodeEscapeString(tmp);
6286 Py_DECREF(tmp);
6287 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288}
6289
6290/* --- Raw Unicode Escape Codec ------------------------------------------- */
6291
Alexander Belopolsky40018472011-02-26 01:02:56 +00006292PyObject *
6293PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006294 Py_ssize_t size,
6295 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006297 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 PyObject *errorHandler = NULL;
6301 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006302
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006304 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006306
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 /* Escaped strings will always be longer than the resulting
6308 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006309 length after conversion to the true value. (But decoding error
6310 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006311 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006312 writer.min_length = size;
6313 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6314 goto onError;
6315 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006316
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 end = s + size;
6318 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 unsigned char c = (unsigned char) *s++;
6320 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006321 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006322 Py_ssize_t startinpos;
6323 Py_ssize_t endinpos;
6324 const char *message;
6325
6326#define WRITE_CHAR(ch) \
6327 do { \
6328 if (ch <= writer.maxchar) { \
6329 assert(writer.pos < writer.size); \
6330 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6331 } \
6332 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6333 goto onError; \
6334 } \
6335 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 if (c != '\\' || s >= end) {
6339 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006342
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 c = (unsigned char) *s++;
6344 if (c == 'u') {
6345 count = 4;
6346 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006348 else if (c == 'U') {
6349 count = 8;
6350 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006351 }
6352 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006353 assert(writer.pos < writer.size);
6354 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6355 WRITE_CHAR(c);
6356 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006357 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006358 startinpos = s - starts - 2;
6359
6360 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6361 for (ch = 0; count && s < end; ++s, --count) {
6362 c = (unsigned char)*s;
6363 ch <<= 4;
6364 if (c >= '0' && c <= '9') {
6365 ch += c - '0';
6366 }
6367 else if (c >= 'a' && c <= 'f') {
6368 ch += c - ('a' - 10);
6369 }
6370 else if (c >= 'A' && c <= 'F') {
6371 ch += c - ('A' - 10);
6372 }
6373 else {
6374 break;
6375 }
6376 }
6377 if (!count) {
6378 if (ch <= MAX_UNICODE) {
6379 WRITE_CHAR(ch);
6380 continue;
6381 }
6382 message = "\\Uxxxxxxxx out of range";
6383 }
6384
6385 endinpos = s-starts;
6386 writer.min_length = end - s + writer.pos;
6387 if (unicode_decode_call_errorhandler_writer(
6388 errors, &errorHandler,
6389 "rawunicodeescape", message,
6390 &starts, &end, &startinpos, &endinpos, &exc, &s,
6391 &writer)) {
6392 goto onError;
6393 }
Miss Islington (bot)09819ef2018-02-12 23:15:57 -08006394 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006395
6396#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 Py_XDECREF(errorHandler);
6399 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006400 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006401
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006403 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 Py_XDECREF(errorHandler);
6405 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006407
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408}
6409
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410
Alexander Belopolsky40018472011-02-26 01:02:56 +00006411PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006412PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006417 int kind;
6418 void *data;
6419 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 if (!PyUnicode_Check(unicode)) {
6422 PyErr_BadArgument();
6423 return NULL;
6424 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428 kind = PyUnicode_KIND(unicode);
6429 data = PyUnicode_DATA(unicode);
6430 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 if (kind == PyUnicode_1BYTE_KIND) {
6432 return PyBytes_FromStringAndSize(data, len);
6433 }
Victor Stinner0e368262011-11-10 20:12:49 +01006434
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6436 bytes, and 1 byte characters 4. */
6437 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006438
Victor Stinner62ec3312016-09-06 17:04:34 -07006439 if (len > PY_SSIZE_T_MAX / expandsize) {
6440 return PyErr_NoMemory();
6441 }
6442 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6443 if (repr == NULL) {
6444 return NULL;
6445 }
6446 if (len == 0) {
6447 return repr;
6448 }
6449
6450 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451 for (pos = 0; pos < len; pos++) {
6452 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006453
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6455 if (ch < 0x100) {
6456 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006457 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6459 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 *p++ = '\\';
6461 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006462 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6463 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6464 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6465 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6468 else {
6469 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6470 *p++ = '\\';
6471 *p++ = 'U';
6472 *p++ = '0';
6473 *p++ = '0';
6474 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6477 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6478 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6479 *p++ = Py_hexdigits[ch & 15];
6480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006482
Victor Stinner62ec3312016-09-06 17:04:34 -07006483 assert(p > PyBytes_AS_STRING(repr));
6484 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6485 return NULL;
6486 }
6487 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488}
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6492 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006495 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006497 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006498 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6499 Py_DECREF(tmp);
6500 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501}
6502
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006503/* --- Unicode Internal Codec ------------------------------------------- */
6504
Alexander Belopolsky40018472011-02-26 01:02:56 +00006505PyObject *
6506_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006507 Py_ssize_t size,
6508 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006509{
6510 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006511 Py_ssize_t startinpos;
6512 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006513 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006514 const char *end;
6515 const char *reason;
6516 PyObject *errorHandler = NULL;
6517 PyObject *exc = NULL;
6518
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006519 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006520 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006521 1))
6522 return NULL;
6523
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006524 if (size < 0) {
6525 PyErr_BadInternalCall();
6526 return NULL;
6527 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006528 if (size == 0)
6529 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006530
Victor Stinner8f674cc2013-04-17 23:02:17 +02006531 _PyUnicodeWriter_Init(&writer);
6532 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6533 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006535 }
6536 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537
Victor Stinner8f674cc2013-04-17 23:02:17 +02006538 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006539 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006540 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006541 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006542 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006543 endinpos = end-starts;
6544 reason = "truncated input";
6545 goto error;
6546 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006547 /* We copy the raw representation one byte at a time because the
6548 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 ((char *) &uch)[0] = s[0];
6550 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006551#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006552 ((char *) &uch)[2] = s[2];
6553 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006554#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006555 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006556#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006557 /* We have to sanity check the raw data, otherwise doom looms for
6558 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006559 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006560 endinpos = s - starts + Py_UNICODE_SIZE;
6561 reason = "illegal code point (> 0x10FFFF)";
6562 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006563 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006564#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006565 s += Py_UNICODE_SIZE;
6566#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006567 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006568 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006569 Py_UNICODE uch2;
6570 ((char *) &uch2)[0] = s[0];
6571 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006572 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006573 {
Victor Stinner551ac952011-11-29 22:58:13 +01006574 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006576 }
6577 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006578#endif
6579
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006580 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006581 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006582 continue;
6583
6584 error:
6585 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006586 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006587 errors, &errorHandler,
6588 "unicode_internal", reason,
6589 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006590 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006591 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592 }
6593
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006596 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006597
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006599 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006600 Py_XDECREF(errorHandler);
6601 Py_XDECREF(exc);
6602 return NULL;
6603}
6604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605/* --- Latin-1 Codec ------------------------------------------------------ */
6606
Alexander Belopolsky40018472011-02-26 01:02:56 +00006607PyObject *
6608PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006609 Py_ssize_t size,
6610 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006613 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614}
6615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006617static void
6618make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006619 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006620 PyObject *unicode,
6621 Py_ssize_t startpos, Py_ssize_t endpos,
6622 const char *reason)
6623{
6624 if (*exceptionObject == NULL) {
6625 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006626 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006627 encoding, unicode, startpos, endpos, reason);
6628 }
6629 else {
6630 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6631 goto onError;
6632 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6633 goto onError;
6634 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6635 goto onError;
6636 return;
6637 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006638 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006639 }
6640}
6641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006642/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006643static void
6644raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006645 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006646 PyObject *unicode,
6647 Py_ssize_t startpos, Py_ssize_t endpos,
6648 const char *reason)
6649{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006650 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006651 encoding, unicode, startpos, endpos, reason);
6652 if (*exceptionObject != NULL)
6653 PyCodec_StrictErrors(*exceptionObject);
6654}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655
6656/* error handling callback helper:
6657 build arguments, call the callback and check the arguments,
6658 put the result into newpos and return the replacement string, which
6659 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006660static PyObject *
6661unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006662 PyObject **errorHandler,
6663 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006664 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006665 Py_ssize_t startpos, Py_ssize_t endpos,
6666 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006668 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006669 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 PyObject *restuple;
6671 PyObject *resunicode;
6672
6673 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006675 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 }
6678
Benjamin Petersonbac79492012-01-14 13:34:47 -05006679 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006680 return NULL;
6681 len = PyUnicode_GET_LENGTH(unicode);
6682
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006683 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006684 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006688 restuple = PyObject_CallFunctionObjArgs(
6689 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006693 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 Py_DECREF(restuple);
6695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006697 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 &resunicode, newpos)) {
6699 Py_DECREF(restuple);
6700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006702 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6703 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6704 Py_DECREF(restuple);
6705 return NULL;
6706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006708 *newpos = len + *newpos;
6709 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006710 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 Py_DECREF(restuple);
6712 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 Py_INCREF(resunicode);
6715 Py_DECREF(restuple);
6716 return resunicode;
6717}
6718
Alexander Belopolsky40018472011-02-26 01:02:56 +00006719static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006721 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006722 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 /* input state */
6725 Py_ssize_t pos=0, size;
6726 int kind;
6727 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006728 /* pointer into the output */
6729 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006730 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6731 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006732 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006734 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006735 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006736 /* output object */
6737 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738
Benjamin Petersonbac79492012-01-14 13:34:47 -05006739 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 return NULL;
6741 size = PyUnicode_GET_LENGTH(unicode);
6742 kind = PyUnicode_KIND(unicode);
6743 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 /* allocate enough for a simple encoding without
6745 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006746 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006747 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006748
6749 _PyBytesWriter_Init(&writer);
6750 str = _PyBytesWriter_Alloc(&writer, size);
6751 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006752 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006753
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006754 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006755 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006758 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006760 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006762 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006764 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006767 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006769
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006770 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006772
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006773 /* Only overallocate the buffer if it's not the last write */
6774 writer.overallocate = (collend < size);
6775
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006777 if (error_handler == _Py_ERROR_UNKNOWN)
6778 error_handler = get_error_handler(errors);
6779
6780 switch (error_handler) {
6781 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006782 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006784
6785 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006786 memset(str, '?', collend - collstart);
6787 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006788 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006789 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 break;
Victor Stinner50149202015-09-22 00:26:54 +02006792
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006793 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006794 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006795 writer.min_size -= (collend - collstart);
6796 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006797 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006798 if (str == NULL)
6799 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006800 pos = collend;
6801 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006802
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006803 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006804 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006805 writer.min_size -= (collend - collstart);
6806 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006807 unicode, collstart, collend);
6808 if (str == NULL)
6809 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006810 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 break;
Victor Stinner50149202015-09-22 00:26:54 +02006812
Victor Stinnerc3713e92015-09-29 12:32:13 +02006813 case _Py_ERROR_SURROGATEESCAPE:
6814 for (i = collstart; i < collend; ++i) {
6815 ch = PyUnicode_READ(kind, data, i);
6816 if (ch < 0xdc80 || 0xdcff < ch) {
6817 /* Not a UTF-8b surrogate */
6818 break;
6819 }
6820 *str++ = (char)(ch - 0xdc00);
6821 ++pos;
6822 }
6823 if (i >= collend)
6824 break;
6825 collstart = pos;
6826 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006827 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006828
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006830 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6831 encoding, reason, unicode, &exc,
6832 collstart, collend, &newpos);
6833 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006835
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006836 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006837 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006838
Victor Stinner6bd525b2015-10-09 13:10:05 +02006839 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006840 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006841 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006842 PyBytes_AS_STRING(rep),
6843 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006844 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006845 else {
6846 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006847
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006850
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006851 if (limit == 256 ?
6852 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6853 !PyUnicode_IS_ASCII(rep))
6854 {
6855 /* Not all characters are smaller than limit */
6856 raise_encode_exception(&exc, encoding, unicode,
6857 collstart, collend, reason);
6858 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006860 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6861 str = _PyBytesWriter_WriteBytes(&writer, str,
6862 PyUnicode_DATA(rep),
6863 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 }
Miss Islington (bot)1e596d32018-08-19 16:17:53 -04006865 if (str == NULL)
6866 goto onError;
6867
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006868 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006869 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006870 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006871
6872 /* If overallocation was disabled, ensure that it was the last
6873 write. Otherwise, we missed an optimization */
6874 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006875 }
6876 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006877
Victor Stinner50149202015-09-22 00:26:54 +02006878 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006879 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006880 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006881
6882 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006883 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006884 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006885 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006886 Py_XDECREF(exc);
6887 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888}
6889
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006891PyObject *
6892PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006893 Py_ssize_t size,
6894 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006896 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006897 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006898 if (unicode == NULL)
6899 return NULL;
6900 result = unicode_encode_ucs1(unicode, errors, 256);
6901 Py_DECREF(unicode);
6902 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903}
6904
Alexander Belopolsky40018472011-02-26 01:02:56 +00006905PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006906_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907{
6908 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 PyErr_BadArgument();
6910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006912 if (PyUnicode_READY(unicode) == -1)
6913 return NULL;
6914 /* Fast path: if it is a one-byte string, construct
6915 bytes object directly. */
6916 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6917 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6918 PyUnicode_GET_LENGTH(unicode));
6919 /* Non-Latin-1 characters present. Defer to above function to
6920 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006921 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006922}
6923
6924PyObject*
6925PyUnicode_AsLatin1String(PyObject *unicode)
6926{
6927 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
6930/* --- 7-bit ASCII Codec -------------------------------------------------- */
6931
Alexander Belopolsky40018472011-02-26 01:02:56 +00006932PyObject *
6933PyUnicode_DecodeASCII(const char *s,
6934 Py_ssize_t size,
6935 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006938 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006939 int kind;
6940 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006941 Py_ssize_t startinpos;
6942 Py_ssize_t endinpos;
6943 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006945 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006947 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006948
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006950 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006951
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006953 if (size == 1 && (unsigned char)s[0] < 128)
6954 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006955
Victor Stinner8f674cc2013-04-17 23:02:17 +02006956 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006957 writer.min_length = size;
6958 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006959 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006961 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006962 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006963 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006964 writer.pos = outpos;
6965 if (writer.pos == size)
6966 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006967
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006968 s += writer.pos;
6969 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006971 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006973 PyUnicode_WRITE(kind, data, writer.pos, c);
6974 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006976 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006978
6979 /* byte outsize range 0x00..0x7f: call the error handler */
6980
6981 if (error_handler == _Py_ERROR_UNKNOWN)
6982 error_handler = get_error_handler(errors);
6983
6984 switch (error_handler)
6985 {
6986 case _Py_ERROR_REPLACE:
6987 case _Py_ERROR_SURROGATEESCAPE:
6988 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006989 but we may switch to UCS2 at the first write */
6990 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6991 goto onError;
6992 kind = writer.kind;
6993 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006994
6995 if (error_handler == _Py_ERROR_REPLACE)
6996 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6997 else
6998 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6999 writer.pos++;
7000 ++s;
7001 break;
7002
7003 case _Py_ERROR_IGNORE:
7004 ++s;
7005 break;
7006
7007 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 startinpos = s-starts;
7009 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007010 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007011 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 "ascii", "ordinal not in range(128)",
7013 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007016 kind = writer.kind;
7017 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007020 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007022 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007023
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007025 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007026 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007027 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 return NULL;
7029}
7030
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007031/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007032PyObject *
7033PyUnicode_EncodeASCII(const Py_UNICODE *p,
7034 Py_ssize_t size,
7035 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007037 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007038 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 if (unicode == NULL)
7040 return NULL;
7041 result = unicode_encode_ucs1(unicode, errors, 128);
7042 Py_DECREF(unicode);
7043 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044}
7045
Alexander Belopolsky40018472011-02-26 01:02:56 +00007046PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007047_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048{
7049 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007050 PyErr_BadArgument();
7051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007053 if (PyUnicode_READY(unicode) == -1)
7054 return NULL;
7055 /* Fast path: if it is an ASCII-only string, construct bytes object
7056 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007057 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007058 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7059 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007060 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007061}
7062
7063PyObject *
7064PyUnicode_AsASCIIString(PyObject *unicode)
7065{
7066 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067}
7068
Steve Dowercc16be82016-09-08 10:35:16 -07007069#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007070
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007071/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007072
/* The code-page helpers below take their lengths as `int`.  When a size_t
   is wider than an int, inputs longer than INT_MAX have to be processed
   in several chunks; NEED_RETRY enables that chunking. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7080
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007081static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007082code_page_name(UINT code_page, PyObject **obj)
7083{
7084 *obj = NULL;
7085 if (code_page == CP_ACP)
7086 return "mbcs";
7087 if (code_page == CP_UTF7)
7088 return "CP_UTF7";
7089 if (code_page == CP_UTF8)
7090 return "CP_UTF8";
7091
7092 *obj = PyBytes_FromFormat("cp%u", code_page);
7093 if (*obj == NULL)
7094 return NULL;
7095 return PyBytes_AS_STRING(*obj);
7096}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097
Victor Stinner3a50e702011-10-18 21:21:00 +02007098static DWORD
7099decode_code_page_flags(UINT code_page)
7100{
7101 if (code_page == CP_UTF7) {
7102 /* The CP_UTF7 decoder only supports flags=0 */
7103 return 0;
7104 }
7105 else
7106 return MB_ERR_INVALID_CHARS;
7107}
7108
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 * Decode a byte string from a Windows code page into unicode object in strict
7111 * mode.
7112 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007113 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7114 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007116static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007117decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007118 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 const char *in,
7120 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121{
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007123 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125
7126 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 assert(insize > 0);
7128 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7129 if (outsize <= 0)
7130 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131
7132 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007134 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007135 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 if (*v == NULL)
7137 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139 }
7140 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007143 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007146 }
7147
7148 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7150 if (outsize <= 0)
7151 goto error;
7152 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154error:
7155 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7156 return -2;
7157 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007158 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007159}
7160
Victor Stinner3a50e702011-10-18 21:21:00 +02007161/*
7162 * Decode a byte string from a code page into unicode object with an error
7163 * handler.
7164 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007165 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 * UnicodeDecodeError exception and returns -1 on error.
7167 */
7168static int
7169decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007170 PyObject **v,
7171 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007172 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007173{
7174 const char *startin = in;
7175 const char *endin = in + size;
7176 const DWORD flags = decode_code_page_flags(code_page);
7177 /* Ideally, we should get reason from FormatMessage. This is the Windows
7178 2000 English version of the message. */
7179 const char *reason = "No mapping for the Unicode character exists "
7180 "in the target code page.";
7181 /* each step cannot decode more than 1 character, but a character can be
7182 represented as a surrogate pair */
7183 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007184 int insize;
7185 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 PyObject *errorHandler = NULL;
7187 PyObject *exc = NULL;
7188 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007189 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 DWORD err;
7191 int ret = -1;
7192
7193 assert(size > 0);
7194
7195 encoding = code_page_name(code_page, &encoding_obj);
7196 if (encoding == NULL)
7197 return -1;
7198
Victor Stinner7d00cc12014-03-17 23:08:06 +01007199 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7201 UnicodeDecodeError. */
7202 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7203 if (exc != NULL) {
7204 PyCodec_StrictErrors(exc);
7205 Py_CLEAR(exc);
7206 }
7207 goto error;
7208 }
7209
7210 if (*v == NULL) {
7211 /* Create unicode object */
7212 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7213 PyErr_NoMemory();
7214 goto error;
7215 }
Victor Stinnerab595942011-12-17 04:59:06 +01007216 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007217 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 if (*v == NULL)
7219 goto error;
7220 startout = PyUnicode_AS_UNICODE(*v);
7221 }
7222 else {
7223 /* Extend unicode object */
7224 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7225 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7226 PyErr_NoMemory();
7227 goto error;
7228 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007229 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 goto error;
7231 startout = PyUnicode_AS_UNICODE(*v) + n;
7232 }
7233
7234 /* Decode the byte string character per character */
7235 out = startout;
7236 while (in < endin)
7237 {
7238 /* Decode a character */
7239 insize = 1;
7240 do
7241 {
7242 outsize = MultiByteToWideChar(code_page, flags,
7243 in, insize,
7244 buffer, Py_ARRAY_LENGTH(buffer));
7245 if (outsize > 0)
7246 break;
7247 err = GetLastError();
7248 if (err != ERROR_NO_UNICODE_TRANSLATION
7249 && err != ERROR_INSUFFICIENT_BUFFER)
7250 {
7251 PyErr_SetFromWindowsErr(0);
7252 goto error;
7253 }
7254 insize++;
7255 }
7256 /* 4=maximum length of a UTF-8 sequence */
7257 while (insize <= 4 && (in + insize) <= endin);
7258
7259 if (outsize <= 0) {
7260 Py_ssize_t startinpos, endinpos, outpos;
7261
Victor Stinner7d00cc12014-03-17 23:08:06 +01007262 /* last character in partial decode? */
7263 if (in + insize >= endin && !final)
7264 break;
7265
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 startinpos = in - startin;
7267 endinpos = startinpos + 1;
7268 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007269 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 errors, &errorHandler,
7271 encoding, reason,
7272 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007273 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 {
7275 goto error;
7276 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007277 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 }
7279 else {
7280 in += insize;
7281 memcpy(out, buffer, outsize * sizeof(wchar_t));
7282 out += outsize;
7283 }
7284 }
7285
7286 /* write a NUL character at the end */
7287 *out = 0;
7288
7289 /* Extend unicode object */
7290 outsize = out - startout;
7291 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007292 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007294 /* (in - startin) <= size and size is an int */
7295 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007296
7297error:
7298 Py_XDECREF(encoding_obj);
7299 Py_XDECREF(errorHandler);
7300 Py_XDECREF(exc);
7301 return ret;
7302}
7303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304static PyObject *
7305decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007306 const char *s, Py_ssize_t size,
7307 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308{
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 PyObject *v = NULL;
7310 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 if (code_page < 0) {
7313 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7314 return NULL;
7315 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007316 if (size < 0) {
7317 PyErr_BadInternalCall();
7318 return NULL;
7319 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007320
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 do
7325 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007327 if (size > INT_MAX) {
7328 chunk_size = INT_MAX;
7329 final = 0;
7330 done = 0;
7331 }
7332 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007334 {
7335 chunk_size = (int)size;
7336 final = (consumed == NULL);
7337 done = 1;
7338 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007339
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 if (chunk_size == 0 && done) {
7341 if (v != NULL)
7342 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007343 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007344 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007345
Victor Stinner76a31a62011-11-04 00:05:13 +01007346 converted = decode_code_page_strict(code_page, &v,
7347 s, chunk_size);
7348 if (converted == -2)
7349 converted = decode_code_page_errors(code_page, &v,
7350 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007351 errors, final);
7352 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007353
7354 if (converted < 0) {
7355 Py_XDECREF(v);
7356 return NULL;
7357 }
7358
7359 if (consumed)
7360 *consumed += converted;
7361
7362 s += converted;
7363 size -= converted;
7364 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007365
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007366 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007367}
7368
Alexander Belopolsky40018472011-02-26 01:02:56 +00007369PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007370PyUnicode_DecodeCodePageStateful(int code_page,
7371 const char *s,
7372 Py_ssize_t size,
7373 const char *errors,
7374 Py_ssize_t *consumed)
7375{
7376 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7377}
7378
7379PyObject *
7380PyUnicode_DecodeMBCSStateful(const char *s,
7381 Py_ssize_t size,
7382 const char *errors,
7383 Py_ssize_t *consumed)
7384{
7385 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7386}
7387
7388PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007389PyUnicode_DecodeMBCS(const char *s,
7390 Py_ssize_t size,
7391 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007392{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007393 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7394}
7395
Victor Stinner3a50e702011-10-18 21:21:00 +02007396static DWORD
7397encode_code_page_flags(UINT code_page, const char *errors)
7398{
7399 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007400 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 }
7402 else if (code_page == CP_UTF7) {
7403 /* CP_UTF7 only supports flags=0 */
7404 return 0;
7405 }
7406 else {
7407 if (errors != NULL && strcmp(errors, "replace") == 0)
7408 return 0;
7409 else
7410 return WC_NO_BEST_FIT_CHARS;
7411 }
7412}
7413
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007414/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 * Encode a Unicode string to a Windows code page into a byte string in strict
7416 * mode.
7417 *
7418 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007419 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007420 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007421static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007422encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007423 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007425{
Victor Stinner554f3f02010-06-16 23:33:54 +00007426 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 BOOL *pusedDefaultChar = &usedDefaultChar;
7428 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007429 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 const DWORD flags = encode_code_page_flags(code_page, NULL);
7432 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007433 /* Create a substring so that we can get the UTF-16 representation
7434 of just the slice under consideration. */
7435 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436
Martin v. Löwis3d325192011-11-04 18:23:06 +01007437 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007438
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007440 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007442 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007443
Victor Stinner2fc507f2011-11-04 20:06:39 +01007444 substring = PyUnicode_Substring(unicode, offset, offset+len);
7445 if (substring == NULL)
7446 return -1;
7447 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7448 if (p == NULL) {
7449 Py_DECREF(substring);
7450 return -1;
7451 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007452 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007453
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007454 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007456 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 NULL, 0,
7458 NULL, pusedDefaultChar);
7459 if (outsize <= 0)
7460 goto error;
7461 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 if (pusedDefaultChar && *pusedDefaultChar) {
7463 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007466
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 if (*outbytes == NULL) {
7471 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475 }
7476 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 const Py_ssize_t n = PyBytes_Size(*outbytes);
7479 if (outsize > PY_SSIZE_T_MAX - n) {
7480 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007481 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7485 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007487 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489 }
7490
7491 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007493 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 out, outsize,
7495 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 if (outsize <= 0)
7498 goto error;
7499 if (pusedDefaultChar && *pusedDefaultChar)
7500 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007501 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007502
Victor Stinner3a50e702011-10-18 21:21:00 +02007503error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007504 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7506 return -2;
7507 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007508 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007509}
7510
Victor Stinner3a50e702011-10-18 21:21:00 +02007511/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007512 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 * error handler.
7514 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007515 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 * -1 on other error.
7517 */
7518static int
7519encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007520 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007521 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007522{
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007524 Py_ssize_t pos = unicode_offset;
7525 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 /* Ideally, we should get reason from FormatMessage. This is the Windows
7527 2000 English version of the message. */
7528 const char *reason = "invalid character";
7529 /* 4=maximum length of a UTF-8 sequence */
7530 char buffer[4];
7531 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7532 Py_ssize_t outsize;
7533 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 PyObject *errorHandler = NULL;
7535 PyObject *exc = NULL;
7536 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007537 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007538 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 PyObject *rep;
7540 int ret = -1;
7541
7542 assert(insize > 0);
7543
7544 encoding = code_page_name(code_page, &encoding_obj);
7545 if (encoding == NULL)
7546 return -1;
7547
7548 if (errors == NULL || strcmp(errors, "strict") == 0) {
7549 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7550 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007551 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 if (exc != NULL) {
7553 PyCodec_StrictErrors(exc);
7554 Py_DECREF(exc);
7555 }
7556 Py_XDECREF(encoding_obj);
7557 return -1;
7558 }
7559
7560 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7561 pusedDefaultChar = &usedDefaultChar;
7562 else
7563 pusedDefaultChar = NULL;
7564
7565 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7566 PyErr_NoMemory();
7567 goto error;
7568 }
7569 outsize = insize * Py_ARRAY_LENGTH(buffer);
7570
7571 if (*outbytes == NULL) {
7572 /* Create string object */
7573 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7574 if (*outbytes == NULL)
7575 goto error;
7576 out = PyBytes_AS_STRING(*outbytes);
7577 }
7578 else {
7579 /* Extend string object */
7580 Py_ssize_t n = PyBytes_Size(*outbytes);
7581 if (n > PY_SSIZE_T_MAX - outsize) {
7582 PyErr_NoMemory();
7583 goto error;
7584 }
7585 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7586 goto error;
7587 out = PyBytes_AS_STRING(*outbytes) + n;
7588 }
7589
7590 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007591 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007592 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007593 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7594 wchar_t chars[2];
7595 int charsize;
7596 if (ch < 0x10000) {
7597 chars[0] = (wchar_t)ch;
7598 charsize = 1;
7599 }
7600 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007601 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7602 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007603 charsize = 2;
7604 }
7605
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007607 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 buffer, Py_ARRAY_LENGTH(buffer),
7609 NULL, pusedDefaultChar);
7610 if (outsize > 0) {
7611 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7612 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007613 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 memcpy(out, buffer, outsize);
7615 out += outsize;
7616 continue;
7617 }
7618 }
7619 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7620 PyErr_SetFromWindowsErr(0);
7621 goto error;
7622 }
7623
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 rep = unicode_encode_call_errorhandler(
7625 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007626 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 if (rep == NULL)
7629 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007630 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007631
7632 if (PyBytes_Check(rep)) {
7633 outsize = PyBytes_GET_SIZE(rep);
7634 if (outsize != 1) {
7635 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7636 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7637 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7638 Py_DECREF(rep);
7639 goto error;
7640 }
7641 out = PyBytes_AS_STRING(*outbytes) + offset;
7642 }
7643 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7644 out += outsize;
7645 }
7646 else {
7647 Py_ssize_t i;
7648 enum PyUnicode_Kind kind;
7649 void *data;
7650
Benjamin Petersonbac79492012-01-14 13:34:47 -05007651 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 Py_DECREF(rep);
7653 goto error;
7654 }
7655
7656 outsize = PyUnicode_GET_LENGTH(rep);
7657 if (outsize != 1) {
7658 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7659 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7660 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7661 Py_DECREF(rep);
7662 goto error;
7663 }
7664 out = PyBytes_AS_STRING(*outbytes) + offset;
7665 }
7666 kind = PyUnicode_KIND(rep);
7667 data = PyUnicode_DATA(rep);
7668 for (i=0; i < outsize; i++) {
7669 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7670 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007671 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007672 encoding, unicode,
7673 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 "unable to encode error handler result to ASCII");
7675 Py_DECREF(rep);
7676 goto error;
7677 }
7678 *out = (unsigned char)ch;
7679 out++;
7680 }
7681 }
7682 Py_DECREF(rep);
7683 }
7684 /* write a NUL byte */
7685 *out = 0;
7686 outsize = out - PyBytes_AS_STRING(*outbytes);
7687 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7688 if (_PyBytes_Resize(outbytes, outsize) < 0)
7689 goto error;
7690 ret = 0;
7691
7692error:
7693 Py_XDECREF(encoding_obj);
7694 Py_XDECREF(errorHandler);
7695 Py_XDECREF(exc);
7696 return ret;
7697}
7698
Victor Stinner3a50e702011-10-18 21:21:00 +02007699static PyObject *
7700encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007701 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 const char *errors)
7703{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007704 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007705 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007706 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007707 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007708
Victor Stinner29dacf22015-01-26 16:41:32 +01007709 if (!PyUnicode_Check(unicode)) {
7710 PyErr_BadArgument();
7711 return NULL;
7712 }
7713
Benjamin Petersonbac79492012-01-14 13:34:47 -05007714 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007715 return NULL;
7716 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007717
Victor Stinner3a50e702011-10-18 21:21:00 +02007718 if (code_page < 0) {
7719 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7720 return NULL;
7721 }
7722
Martin v. Löwis3d325192011-11-04 18:23:06 +01007723 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007724 return PyBytes_FromStringAndSize(NULL, 0);
7725
Victor Stinner7581cef2011-11-03 22:32:33 +01007726 offset = 0;
7727 do
7728 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007730 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 chunks. */
7732 if (len > INT_MAX/2) {
7733 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007734 done = 0;
7735 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007737#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007739 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 done = 1;
7741 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007742
Victor Stinner76a31a62011-11-04 00:05:13 +01007743 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 errors);
7746 if (ret == -2)
7747 ret = encode_code_page_errors(code_page, &outbytes,
7748 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007749 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007750 if (ret < 0) {
7751 Py_XDECREF(outbytes);
7752 return NULL;
7753 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007754
Victor Stinner7581cef2011-11-03 22:32:33 +01007755 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007756 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007757 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007758
Victor Stinner3a50e702011-10-18 21:21:00 +02007759 return outbytes;
7760}
7761
7762PyObject *
7763PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7764 Py_ssize_t size,
7765 const char *errors)
7766{
Victor Stinner7581cef2011-11-03 22:32:33 +01007767 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007768 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 if (unicode == NULL)
7770 return NULL;
7771 res = encode_code_page(CP_ACP, unicode, errors);
7772 Py_DECREF(unicode);
7773 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007774}
7775
7776PyObject *
7777PyUnicode_EncodeCodePage(int code_page,
7778 PyObject *unicode,
7779 const char *errors)
7780{
Victor Stinner7581cef2011-11-03 22:32:33 +01007781 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007782}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007783
Alexander Belopolsky40018472011-02-26 01:02:56 +00007784PyObject *
7785PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007786{
Victor Stinner7581cef2011-11-03 22:32:33 +01007787 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007788}
7789
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007790#undef NEED_RETRY
7791
Steve Dowercc16be82016-09-08 10:35:16 -07007792#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007793
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794/* --- Character Mapping Codec -------------------------------------------- */
7795
Victor Stinnerfb161b12013-04-18 01:44:27 +02007796static int
7797charmap_decode_string(const char *s,
7798 Py_ssize_t size,
7799 PyObject *mapping,
7800 const char *errors,
7801 _PyUnicodeWriter *writer)
7802{
7803 const char *starts = s;
7804 const char *e;
7805 Py_ssize_t startinpos, endinpos;
7806 PyObject *errorHandler = NULL, *exc = NULL;
7807 Py_ssize_t maplen;
7808 enum PyUnicode_Kind mapkind;
7809 void *mapdata;
7810 Py_UCS4 x;
7811 unsigned char ch;
7812
7813 if (PyUnicode_READY(mapping) == -1)
7814 return -1;
7815
7816 maplen = PyUnicode_GET_LENGTH(mapping);
7817 mapdata = PyUnicode_DATA(mapping);
7818 mapkind = PyUnicode_KIND(mapping);
7819
7820 e = s + size;
7821
7822 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7823 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7824 * is disabled in encoding aliases, latin1 is preferred because
7825 * its implementation is faster. */
7826 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7827 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7828 Py_UCS4 maxchar = writer->maxchar;
7829
7830 assert (writer->kind == PyUnicode_1BYTE_KIND);
7831 while (s < e) {
7832 ch = *s;
7833 x = mapdata_ucs1[ch];
7834 if (x > maxchar) {
7835 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7836 goto onError;
7837 maxchar = writer->maxchar;
7838 outdata = (Py_UCS1 *)writer->data;
7839 }
7840 outdata[writer->pos] = x;
7841 writer->pos++;
7842 ++s;
7843 }
7844 return 0;
7845 }
7846
7847 while (s < e) {
7848 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7849 enum PyUnicode_Kind outkind = writer->kind;
7850 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7851 if (outkind == PyUnicode_1BYTE_KIND) {
7852 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7853 Py_UCS4 maxchar = writer->maxchar;
7854 while (s < e) {
7855 ch = *s;
7856 x = mapdata_ucs2[ch];
7857 if (x > maxchar)
7858 goto Error;
7859 outdata[writer->pos] = x;
7860 writer->pos++;
7861 ++s;
7862 }
7863 break;
7864 }
7865 else if (outkind == PyUnicode_2BYTE_KIND) {
7866 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7867 while (s < e) {
7868 ch = *s;
7869 x = mapdata_ucs2[ch];
7870 if (x == 0xFFFE)
7871 goto Error;
7872 outdata[writer->pos] = x;
7873 writer->pos++;
7874 ++s;
7875 }
7876 break;
7877 }
7878 }
7879 ch = *s;
7880
7881 if (ch < maplen)
7882 x = PyUnicode_READ(mapkind, mapdata, ch);
7883 else
7884 x = 0xfffe; /* invalid value */
7885Error:
7886 if (x == 0xfffe)
7887 {
7888 /* undefined mapping */
7889 startinpos = s-starts;
7890 endinpos = startinpos+1;
7891 if (unicode_decode_call_errorhandler_writer(
7892 errors, &errorHandler,
7893 "charmap", "character maps to <undefined>",
7894 &starts, &e, &startinpos, &endinpos, &exc, &s,
7895 writer)) {
7896 goto onError;
7897 }
7898 continue;
7899 }
7900
7901 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7902 goto onError;
7903 ++s;
7904 }
7905 Py_XDECREF(errorHandler);
7906 Py_XDECREF(exc);
7907 return 0;
7908
7909onError:
7910 Py_XDECREF(errorHandler);
7911 Py_XDECREF(exc);
7912 return -1;
7913}
7914
7915static int
7916charmap_decode_mapping(const char *s,
7917 Py_ssize_t size,
7918 PyObject *mapping,
7919 const char *errors,
7920 _PyUnicodeWriter *writer)
7921{
7922 const char *starts = s;
7923 const char *e;
7924 Py_ssize_t startinpos, endinpos;
7925 PyObject *errorHandler = NULL, *exc = NULL;
7926 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007927 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007928
7929 e = s + size;
7930
7931 while (s < e) {
7932 ch = *s;
7933
7934 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7935 key = PyLong_FromLong((long)ch);
7936 if (key == NULL)
7937 goto onError;
7938
7939 item = PyObject_GetItem(mapping, key);
7940 Py_DECREF(key);
7941 if (item == NULL) {
7942 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7943 /* No mapping found means: mapping is undefined. */
7944 PyErr_Clear();
7945 goto Undefined;
7946 } else
7947 goto onError;
7948 }
7949
7950 /* Apply mapping */
7951 if (item == Py_None)
7952 goto Undefined;
7953 if (PyLong_Check(item)) {
7954 long value = PyLong_AS_LONG(item);
7955 if (value == 0xFFFE)
7956 goto Undefined;
7957 if (value < 0 || value > MAX_UNICODE) {
7958 PyErr_Format(PyExc_TypeError,
7959 "character mapping must be in range(0x%lx)",
7960 (unsigned long)MAX_UNICODE + 1);
7961 goto onError;
7962 }
7963
7964 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7965 goto onError;
7966 }
7967 else if (PyUnicode_Check(item)) {
7968 if (PyUnicode_READY(item) == -1)
7969 goto onError;
7970 if (PyUnicode_GET_LENGTH(item) == 1) {
7971 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7972 if (value == 0xFFFE)
7973 goto Undefined;
7974 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7975 goto onError;
7976 }
7977 else {
7978 writer->overallocate = 1;
7979 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7980 goto onError;
7981 }
7982 }
7983 else {
7984 /* wrong return value */
7985 PyErr_SetString(PyExc_TypeError,
7986 "character mapping must return integer, None or str");
7987 goto onError;
7988 }
7989 Py_CLEAR(item);
7990 ++s;
7991 continue;
7992
7993Undefined:
7994 /* undefined mapping */
7995 Py_CLEAR(item);
7996 startinpos = s-starts;
7997 endinpos = startinpos+1;
7998 if (unicode_decode_call_errorhandler_writer(
7999 errors, &errorHandler,
8000 "charmap", "character maps to <undefined>",
8001 &starts, &e, &startinpos, &endinpos, &exc, &s,
8002 writer)) {
8003 goto onError;
8004 }
8005 }
8006 Py_XDECREF(errorHandler);
8007 Py_XDECREF(exc);
8008 return 0;
8009
8010onError:
8011 Py_XDECREF(item);
8012 Py_XDECREF(errorHandler);
8013 Py_XDECREF(exc);
8014 return -1;
8015}
8016
Alexander Belopolsky40018472011-02-26 01:02:56 +00008017PyObject *
8018PyUnicode_DecodeCharmap(const char *s,
8019 Py_ssize_t size,
8020 PyObject *mapping,
8021 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008023 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008024
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 /* Default to Latin-1 */
8026 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008030 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008031 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008032 writer.min_length = size;
8033 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008035
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008036 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008037 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8038 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008039 }
8040 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008041 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8042 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008044 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008045
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008047 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 return NULL;
8049}
8050
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051/* Charmap encoding: the lookup table */
8052
Alexander Belopolsky40018472011-02-26 01:02:56 +00008053struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 PyObject_HEAD
8055 unsigned char level1[32];
8056 int count2, count3;
8057 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058};
8059
8060static PyObject*
8061encoding_map_size(PyObject *obj, PyObject* args)
8062{
8063 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008064 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066}
8067
8068static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008069 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 PyDoc_STR("Return the size (in bytes) of this object") },
8071 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072};
8073
8074static void
8075encoding_map_dealloc(PyObject* o)
8076{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008077 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078}
8079
8080static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 "EncodingMap", /*tp_name*/
8083 sizeof(struct encoding_map), /*tp_basicsize*/
8084 0, /*tp_itemsize*/
8085 /* methods */
8086 encoding_map_dealloc, /*tp_dealloc*/
8087 0, /*tp_print*/
8088 0, /*tp_getattr*/
8089 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008090 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 0, /*tp_repr*/
8092 0, /*tp_as_number*/
8093 0, /*tp_as_sequence*/
8094 0, /*tp_as_mapping*/
8095 0, /*tp_hash*/
8096 0, /*tp_call*/
8097 0, /*tp_str*/
8098 0, /*tp_getattro*/
8099 0, /*tp_setattro*/
8100 0, /*tp_as_buffer*/
8101 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8102 0, /*tp_doc*/
8103 0, /*tp_traverse*/
8104 0, /*tp_clear*/
8105 0, /*tp_richcompare*/
8106 0, /*tp_weaklistoffset*/
8107 0, /*tp_iter*/
8108 0, /*tp_iternext*/
8109 encoding_map_methods, /*tp_methods*/
8110 0, /*tp_members*/
8111 0, /*tp_getset*/
8112 0, /*tp_base*/
8113 0, /*tp_dict*/
8114 0, /*tp_descr_get*/
8115 0, /*tp_descr_set*/
8116 0, /*tp_dictoffset*/
8117 0, /*tp_init*/
8118 0, /*tp_alloc*/
8119 0, /*tp_new*/
8120 0, /*tp_free*/
8121 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122};
8123
8124PyObject*
8125PyUnicode_BuildEncodingMap(PyObject* string)
8126{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008127 PyObject *result;
8128 struct encoding_map *mresult;
8129 int i;
8130 int need_dict = 0;
8131 unsigned char level1[32];
8132 unsigned char level2[512];
8133 unsigned char *mlevel1, *mlevel2, *mlevel3;
8134 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 int kind;
8136 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008137 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008140 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 PyErr_BadArgument();
8142 return NULL;
8143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 kind = PyUnicode_KIND(string);
8145 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008146 length = PyUnicode_GET_LENGTH(string);
8147 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 memset(level1, 0xFF, sizeof level1);
8149 memset(level2, 0xFF, sizeof level2);
8150
8151 /* If there isn't a one-to-one mapping of NULL to \0,
8152 or if there are non-BMP characters, we need to use
8153 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008154 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008156 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 ch = PyUnicode_READ(kind, data, i);
8159 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 need_dict = 1;
8161 break;
8162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008163 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164 /* unmapped character */
8165 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 l1 = ch >> 11;
8167 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008168 if (level1[l1] == 0xFF)
8169 level1[l1] = count2++;
8170 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008171 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 }
8173
8174 if (count2 >= 0xFF || count3 >= 0xFF)
8175 need_dict = 1;
8176
8177 if (need_dict) {
8178 PyObject *result = PyDict_New();
8179 PyObject *key, *value;
8180 if (!result)
8181 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008182 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008184 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185 if (!key || !value)
8186 goto failed1;
8187 if (PyDict_SetItem(result, key, value) == -1)
8188 goto failed1;
8189 Py_DECREF(key);
8190 Py_DECREF(value);
8191 }
8192 return result;
8193 failed1:
8194 Py_XDECREF(key);
8195 Py_XDECREF(value);
8196 Py_DECREF(result);
8197 return NULL;
8198 }
8199
8200 /* Create a three-level trie */
8201 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8202 16*count2 + 128*count3 - 1);
8203 if (!result)
8204 return PyErr_NoMemory();
8205 PyObject_Init(result, &EncodingMapType);
8206 mresult = (struct encoding_map*)result;
8207 mresult->count2 = count2;
8208 mresult->count3 = count3;
8209 mlevel1 = mresult->level1;
8210 mlevel2 = mresult->level23;
8211 mlevel3 = mresult->level23 + 16*count2;
8212 memcpy(mlevel1, level1, 32);
8213 memset(mlevel2, 0xFF, 16*count2);
8214 memset(mlevel3, 0, 128*count3);
8215 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008216 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008218 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8219 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 /* unmapped character */
8221 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008222 o1 = ch>>11;
8223 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008224 i2 = 16*mlevel1[o1] + o2;
8225 if (mlevel2[i2] == 0xFF)
8226 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008227 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008228 i3 = 128*mlevel2[i2] + o3;
8229 mlevel3[i3] = i;
8230 }
8231 return result;
8232}
8233
8234static int
Victor Stinner22168992011-11-20 17:09:18 +01008235encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236{
8237 struct encoding_map *map = (struct encoding_map*)mapping;
8238 int l1 = c>>11;
8239 int l2 = (c>>7) & 0xF;
8240 int l3 = c & 0x7F;
8241 int i;
8242
Victor Stinner22168992011-11-20 17:09:18 +01008243 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008245 if (c == 0)
8246 return 0;
8247 /* level 1*/
8248 i = map->level1[l1];
8249 if (i == 0xFF) {
8250 return -1;
8251 }
8252 /* level 2*/
8253 i = map->level23[16*i+l2];
8254 if (i == 0xFF) {
8255 return -1;
8256 }
8257 /* level 3 */
8258 i = map->level23[16*map->count2 + 128*i + l3];
8259 if (i == 0) {
8260 return -1;
8261 }
8262 return i;
8263}
8264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265/* Lookup the character ch in the mapping. If the character
8266 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008267 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008269charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Christian Heimes217cfd12007-12-02 14:31:20 +00008271 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 PyObject *x;
8273
8274 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 x = PyObject_GetItem(mapping, w);
8277 Py_DECREF(w);
8278 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8280 /* No mapping found means: mapping is undefined. */
8281 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008282 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 } else
8284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008286 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008288 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 long value = PyLong_AS_LONG(x);
8290 if (value < 0 || value > 255) {
8291 PyErr_SetString(PyExc_TypeError,
8292 "character mapping must be in range(256)");
8293 Py_DECREF(x);
8294 return NULL;
8295 }
8296 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008298 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 /* wrong return value */
8302 PyErr_Format(PyExc_TypeError,
8303 "character mapping must return integer, bytes or None, not %.400s",
8304 x->ob_type->tp_name);
8305 Py_DECREF(x);
8306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 }
8308}
8309
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008311charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008313 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8314 /* exponentially overallocate to minimize reallocations */
8315 if (requiredsize < 2*outsize)
8316 requiredsize = 2*outsize;
8317 if (_PyBytes_Resize(outobj, requiredsize))
8318 return -1;
8319 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320}
8321
/* Status codes of the charmap encoder output step.
   NOTE(review): meanings are inferred from the names and the visible
   callers -- confirm against charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* presumably: the character was written */
    enc_FAILED,     /* returned when the character has no mapping */
    enc_EXCEPTION   /* returned when growing the output buffer failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008325/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008326 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327 space is available. Return a new reference to the object that
8328 was put in the output buffer, or Py_None, if the mapping was undefined
8329 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008330 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008332charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 PyObject *rep;
8336 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008337 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338
Christian Heimes90aa7642007-12-19 02:45:37 +00008339 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 if (res == -1)
8343 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 if (outsize<requiredsize)
8345 if (charmapencode_resize(outobj, outpos, requiredsize))
8346 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008347 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 outstart[(*outpos)++] = (char)res;
8349 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 }
8351
8352 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 Py_DECREF(rep);
8357 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 if (PyLong_Check(rep)) {
8360 Py_ssize_t requiredsize = *outpos+1;
8361 if (outsize<requiredsize)
8362 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8363 Py_DECREF(rep);
8364 return enc_EXCEPTION;
8365 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008366 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008368 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 else {
8370 const char *repchars = PyBytes_AS_STRING(rep);
8371 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8372 Py_ssize_t requiredsize = *outpos+repsize;
8373 if (outsize<requiredsize)
8374 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8375 Py_DECREF(rep);
8376 return enc_EXCEPTION;
8377 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008378 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 memcpy(outstart + *outpos, repchars, repsize);
8380 *outpos += repsize;
8381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 Py_DECREF(rep);
8384 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385}
8386
8387/* handle an error in PyUnicode_EncodeCharmap
8388 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008389static int
8390charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008391 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008393 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008394 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395{
8396 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008397 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008399 enum PyUnicode_Kind kind;
8400 void *data;
8401 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008403 Py_ssize_t collstartpos = *inpos;
8404 Py_ssize_t collendpos = *inpos+1;
8405 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008406 const char *encoding = "charmap";
8407 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008408 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008409 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008410 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411
Benjamin Petersonbac79492012-01-14 13:34:47 -05008412 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008413 return -1;
8414 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 /* find all unencodable characters */
8416 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008417 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008418 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008420 val = encoding_map_lookup(ch, mapping);
8421 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 break;
8423 ++collendpos;
8424 continue;
8425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008427 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8428 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 if (rep==NULL)
8430 return -1;
8431 else if (rep!=Py_None) {
8432 Py_DECREF(rep);
8433 break;
8434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 }
8438 /* cache callback name lookup
8439 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008440 if (*error_handler == _Py_ERROR_UNKNOWN)
8441 *error_handler = get_error_handler(errors);
8442
8443 switch (*error_handler) {
8444 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008445 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008447
8448 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008449 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 x = charmapencode_output('?', mapping, res, respos);
8451 if (x==enc_EXCEPTION) {
8452 return -1;
8453 }
8454 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008455 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return -1;
8457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 }
8459 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008460 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008461 *inpos = collendpos;
8462 break;
Victor Stinner50149202015-09-22 00:26:54 +02008463
8464 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 /* generate replacement (temporarily (mis)uses p) */
8466 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 char buffer[2+29+1+1];
8468 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008469 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 for (cp = buffer; *cp; ++cp) {
8471 x = charmapencode_output(*cp, mapping, res, respos);
8472 if (x==enc_EXCEPTION)
8473 return -1;
8474 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008475 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 return -1;
8477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 }
8479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 *inpos = collendpos;
8481 break;
Victor Stinner50149202015-09-22 00:26:54 +02008482
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 default:
Victor Stinner50149202015-09-22 00:26:54 +02008484 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008485 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008489 if (PyBytes_Check(repunicode)) {
8490 /* Directly copy bytes result to output. */
8491 Py_ssize_t outsize = PyBytes_Size(*res);
8492 Py_ssize_t requiredsize;
8493 repsize = PyBytes_Size(repunicode);
8494 requiredsize = *respos + repsize;
8495 if (requiredsize > outsize)
8496 /* Make room for all additional bytes. */
8497 if (charmapencode_resize(res, respos, requiredsize)) {
8498 Py_DECREF(repunicode);
8499 return -1;
8500 }
8501 memcpy(PyBytes_AsString(*res) + *respos,
8502 PyBytes_AsString(repunicode), repsize);
8503 *respos += repsize;
8504 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008505 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008506 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008509 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008510 Py_DECREF(repunicode);
8511 return -1;
8512 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008513 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008514 data = PyUnicode_DATA(repunicode);
8515 kind = PyUnicode_KIND(repunicode);
8516 for (index = 0; index < repsize; index++) {
8517 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8518 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008520 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 return -1;
8522 }
8523 else if (x==enc_FAILED) {
8524 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008525 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 return -1;
8527 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008528 }
8529 *inpos = newpos;
8530 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 }
8532 return 0;
8533}
8534
Alexander Belopolsky40018472011-02-26 01:02:56 +00008535PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536_PyUnicode_EncodeCharmap(PyObject *unicode,
8537 PyObject *mapping,
8538 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 /* output object */
8541 PyObject *res = NULL;
8542 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008543 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008544 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008546 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008547 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008549 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008550 void *data;
8551 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
Benjamin Petersonbac79492012-01-14 13:34:47 -05008553 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 return NULL;
8555 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008556 data = PyUnicode_DATA(unicode);
8557 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 /* Default to Latin-1 */
8560 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008561 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 /* allocate enough for a simple encoding without
8564 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008565 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 if (res == NULL)
8567 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008568 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008572 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008574 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 if (x==enc_EXCEPTION) /* error */
8576 goto onError;
8577 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008578 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008580 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 &res, &respos)) {
8582 goto onError;
8583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008584 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 else
8586 /* done with this character => adjust input position */
8587 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008591 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008592 if (_PyBytes_Resize(&res, respos) < 0)
8593 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008596 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 return res;
8598
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 Py_XDECREF(res);
8601 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008602 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 return NULL;
8604}
8605
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606/* Deprecated */
8607PyObject *
8608PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8609 Py_ssize_t size,
8610 PyObject *mapping,
8611 const char *errors)
8612{
8613 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008614 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008615 if (unicode == NULL)
8616 return NULL;
8617 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8618 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008619 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008620}
8621
Alexander Belopolsky40018472011-02-26 01:02:56 +00008622PyObject *
8623PyUnicode_AsCharmapString(PyObject *unicode,
8624 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625{
8626 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 PyErr_BadArgument();
8628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008630 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631}
8632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008634static void
8635make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008637 Py_ssize_t startpos, Py_ssize_t endpos,
8638 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 *exceptionObject = _PyUnicodeTranslateError_Create(
8642 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 }
8644 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8646 goto onError;
8647 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8648 goto onError;
8649 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8650 goto onError;
8651 return;
8652 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008653 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 }
8655}
8656
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657/* error handling callback helper:
8658 build arguments, call the callback and check the arguments,
8659 put the result into newpos and return the replacement string, which
8660 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008661static PyObject *
8662unicode_translate_call_errorhandler(const char *errors,
8663 PyObject **errorHandler,
8664 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008666 Py_ssize_t startpos, Py_ssize_t endpos,
8667 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008669 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008671 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 PyObject *restuple;
8673 PyObject *resunicode;
8674
8675 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 }
8680
8681 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008686 restuple = PyObject_CallFunctionObjArgs(
8687 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008691 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 Py_DECREF(restuple);
8693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008695 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 &resunicode, &i_newpos)) {
8697 Py_DECREF(restuple);
8698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008700 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008702 else
8703 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008705 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 Py_DECREF(restuple);
8707 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 Py_INCREF(resunicode);
8710 Py_DECREF(restuple);
8711 return resunicode;
8712}
8713
8714/* Lookup the character ch in the mapping and put the result in result,
8715 which must be decrefed by the caller.
8716 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008717static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719{
Christian Heimes217cfd12007-12-02 14:31:20 +00008720 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 PyObject *x;
8722
8723 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725 x = PyObject_GetItem(mapping, w);
8726 Py_DECREF(w);
8727 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8729 /* No mapping found means: use 1:1 mapping. */
8730 PyErr_Clear();
8731 *result = NULL;
8732 return 0;
8733 } else
8734 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 }
8736 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 *result = x;
8738 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008740 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008742 if (value < 0 || value > MAX_UNICODE) {
8743 PyErr_Format(PyExc_ValueError,
8744 "character mapping must be in range(0x%x)",
8745 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 Py_DECREF(x);
8747 return -1;
8748 }
8749 *result = x;
8750 return 0;
8751 }
8752 else if (PyUnicode_Check(x)) {
8753 *result = x;
8754 return 0;
8755 }
8756 else {
8757 /* wrong return value */
8758 PyErr_SetString(PyExc_TypeError,
8759 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008760 Py_DECREF(x);
8761 return -1;
8762 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763}
Victor Stinner1194ea02014-04-04 19:37:40 +02008764
8765/* lookup the character, write the result into the writer.
8766 Return 1 if the result was written into the writer, return 0 if the mapping
8767 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008768static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008769charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8770 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008771{
Victor Stinner1194ea02014-04-04 19:37:40 +02008772 PyObject *item;
8773
8774 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008776
8777 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008779 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008782 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008784
8785 if (item == Py_None) {
8786 Py_DECREF(item);
8787 return 0;
8788 }
8789
8790 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008791 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8792 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8793 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008794 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8795 Py_DECREF(item);
8796 return -1;
8797 }
8798 Py_DECREF(item);
8799 return 1;
8800 }
8801
8802 if (!PyUnicode_Check(item)) {
8803 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008805 }
8806
8807 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8808 Py_DECREF(item);
8809 return -1;
8810 }
8811
8812 Py_DECREF(item);
8813 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814}
8815
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816static int
8817unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8818 Py_UCS1 *translate)
8819{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008820 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821 int ret = 0;
8822
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 if (charmaptranslate_lookup(ch, mapping, &item)) {
8824 return -1;
8825 }
8826
8827 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008828 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008829 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008831 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008832 /* not found => default to 1:1 mapping */
8833 translate[ch] = ch;
8834 return 1;
8835 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008836 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008837 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008838 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8839 used it */
8840 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 /* invalid character or character outside ASCII:
8842 skip the fast translate */
8843 goto exit;
8844 }
8845 translate[ch] = (Py_UCS1)replace;
8846 }
8847 else if (PyUnicode_Check(item)) {
8848 Py_UCS4 replace;
8849
8850 if (PyUnicode_READY(item) == -1) {
8851 Py_DECREF(item);
8852 return -1;
8853 }
8854 if (PyUnicode_GET_LENGTH(item) != 1)
8855 goto exit;
8856
8857 replace = PyUnicode_READ_CHAR(item, 0);
8858 if (replace > 127)
8859 goto exit;
8860 translate[ch] = (Py_UCS1)replace;
8861 }
8862 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008863 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008864 goto exit;
8865 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 ret = 1;
8867
Benjamin Peterson1365de72014-04-07 20:15:41 -04008868 exit:
8869 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008870 return ret;
8871}
8872
8873/* Fast path for ascii => ascii translation. Return 1 if the whole string
8874 was translated into writer, return 0 if the input string was partially
8875 translated into writer, raise an exception and return -1 on error. */
8876static int
8877unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008878 _PyUnicodeWriter *writer, int ignore,
8879 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008880{
Victor Stinner872b2912014-04-05 14:27:07 +02008881 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882 Py_ssize_t len;
8883 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008884 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008885
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886 len = PyUnicode_GET_LENGTH(input);
8887
Victor Stinner872b2912014-04-05 14:27:07 +02008888 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008889
8890 in = PyUnicode_1BYTE_DATA(input);
8891 end = in + len;
8892
8893 assert(PyUnicode_IS_ASCII(writer->buffer));
8894 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8895 out = PyUnicode_1BYTE_DATA(writer->buffer);
8896
Victor Stinner872b2912014-04-05 14:27:07 +02008897 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008899 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008901 int translate = unicode_fast_translate_lookup(mapping, ch,
8902 ascii_table);
8903 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008905 if (translate == 0)
8906 goto exit;
8907 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 }
Victor Stinner872b2912014-04-05 14:27:07 +02008909 if (ch2 == 0xfe) {
8910 if (ignore)
8911 continue;
8912 goto exit;
8913 }
8914 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008916 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 }
Victor Stinner872b2912014-04-05 14:27:07 +02008918 res = 1;
8919
8920exit:
8921 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008922 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008923 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008924}
8925
Victor Stinner3222da22015-10-01 22:07:32 +02008926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927_PyUnicode_TranslateCharmap(PyObject *input,
8928 PyObject *mapping,
8929 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008932 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 Py_ssize_t size, i;
8934 int kind;
8935 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008936 _PyUnicodeWriter writer;
8937 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008938 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008939 PyObject *errorHandler = NULL;
8940 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008941 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008942 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008943
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 PyErr_BadArgument();
8946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 if (PyUnicode_READY(input) == -1)
8950 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008951 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 kind = PyUnicode_KIND(input);
8953 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008955 if (size == 0)
8956 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008958 /* allocate enough for a simple 1:1 translation without
8959 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008960 _PyUnicodeWriter_Init(&writer);
8961 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963
Victor Stinner872b2912014-04-05 14:27:07 +02008964 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8965
Victor Stinner33798672016-03-01 21:59:58 +01008966 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008967 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008968 if (PyUnicode_IS_ASCII(input)) {
8969 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8970 if (res < 0) {
8971 _PyUnicodeWriter_Dealloc(&writer);
8972 return NULL;
8973 }
8974 if (res == 1)
8975 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008976 }
Victor Stinner33798672016-03-01 21:59:58 +01008977 else {
8978 i = 0;
8979 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008983 int translate;
8984 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8985 Py_ssize_t newpos;
8986 /* startpos for collecting untranslatable chars */
8987 Py_ssize_t collstart;
8988 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 ch = PyUnicode_READ(kind, data, i);
8992 translate = charmaptranslate_output(ch, mapping, &writer);
8993 if (translate < 0)
8994 goto onError;
8995
8996 if (translate != 0) {
8997 /* it worked => adjust input pointer */
8998 ++i;
8999 continue;
9000 }
9001
9002 /* untranslatable character */
9003 collstart = i;
9004 collend = i+1;
9005
9006 /* find all untranslatable characters */
9007 while (collend < size) {
9008 PyObject *x;
9009 ch = PyUnicode_READ(kind, data, collend);
9010 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009011 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 Py_XDECREF(x);
9013 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009015 ++collend;
9016 }
9017
9018 if (ignore) {
9019 i = collend;
9020 }
9021 else {
9022 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9023 reason, input, &exc,
9024 collstart, collend, &newpos);
9025 if (repunicode == NULL)
9026 goto onError;
9027 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009030 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009031 Py_DECREF(repunicode);
9032 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009033 }
9034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009035 Py_XDECREF(exc);
9036 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009037 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009040 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009041 Py_XDECREF(exc);
9042 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 return NULL;
9044}
9045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046/* Deprecated. Use PyUnicode_Translate instead. */
9047PyObject *
9048PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9049 Py_ssize_t size,
9050 PyObject *mapping,
9051 const char *errors)
9052{
Christian Heimes5f520f42012-09-11 14:03:25 +02009053 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009054 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 if (!unicode)
9056 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009057 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9058 Py_DECREF(unicode);
9059 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060}
9061
Alexander Belopolsky40018472011-02-26 01:02:56 +00009062PyObject *
9063PyUnicode_Translate(PyObject *str,
9064 PyObject *mapping,
9065 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009067 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009068 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009069 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070}
Tim Petersced69f82003-09-16 20:30:58 +00009071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072PyObject *
9073_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9074{
9075 if (!PyUnicode_Check(unicode)) {
9076 PyErr_BadInternalCall();
9077 return NULL;
9078 }
9079 if (PyUnicode_READY(unicode) == -1)
9080 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009081 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 /* If the string is already ASCII, just return the same string */
9083 Py_INCREF(unicode);
9084 return unicode;
9085 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009086
9087 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9088 PyObject *result = PyUnicode_New(len, 127);
9089 if (result == NULL) {
9090 return NULL;
9091 }
9092
9093 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9094 int kind = PyUnicode_KIND(unicode);
9095 const void *data = PyUnicode_DATA(unicode);
9096 Py_ssize_t i;
9097 for (i = 0; i < len; ++i) {
9098 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9099 if (ch < 127) {
9100 out[i] = ch;
9101 }
9102 else if (Py_UNICODE_ISSPACE(ch)) {
9103 out[i] = ' ';
9104 }
9105 else {
9106 int decimal = Py_UNICODE_TODECIMAL(ch);
9107 if (decimal < 0) {
9108 out[i] = '?';
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009109 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009110 _PyUnicode_LENGTH(result) = i + 1;
9111 break;
9112 }
9113 out[i] = '0' + decimal;
9114 }
9115 }
9116
Miss Islington (bot)c7214722018-07-13 20:58:12 -07009117 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009118 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119}
9120
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009121PyObject *
9122PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9123 Py_ssize_t length)
9124{
Victor Stinnerf0124502011-11-21 23:12:56 +01009125 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009126 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009127 Py_UCS4 maxchar;
9128 enum PyUnicode_Kind kind;
9129 void *data;
9130
Victor Stinner99d7ad02012-02-22 13:37:39 +01009131 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009132 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009133 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009134 if (ch > 127) {
9135 int decimal = Py_UNICODE_TODECIMAL(ch);
9136 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009137 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009138 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009139 }
9140 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009141
9142 /* Copy to a new string */
9143 decimal = PyUnicode_New(length, maxchar);
9144 if (decimal == NULL)
9145 return decimal;
9146 kind = PyUnicode_KIND(decimal);
9147 data = PyUnicode_DATA(decimal);
9148 /* Iterate over code points */
9149 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009150 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009151 if (ch > 127) {
9152 int decimal = Py_UNICODE_TODECIMAL(ch);
9153 if (decimal >= 0)
9154 ch = '0' + decimal;
9155 }
9156 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009158 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009159}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009160/* --- Decimal Encoder ---------------------------------------------------- */
9161
Alexander Belopolsky40018472011-02-26 01:02:56 +00009162int
9163PyUnicode_EncodeDecimal(Py_UNICODE *s,
9164 Py_ssize_t length,
9165 char *output,
9166 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009167{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009168 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009169 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009170 enum PyUnicode_Kind kind;
9171 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009172
9173 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 PyErr_BadArgument();
9175 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009176 }
9177
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009178 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 if (unicode == NULL)
9180 return -1;
9181
Victor Stinner42bf7752011-11-21 22:52:58 +01009182 kind = PyUnicode_KIND(unicode);
9183 data = PyUnicode_DATA(unicode);
9184
Victor Stinnerb84d7232011-11-22 01:50:07 +01009185 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009186 PyObject *exc;
9187 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009189 Py_ssize_t startpos;
9190
9191 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009192
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009195 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009197 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 decimal = Py_UNICODE_TODECIMAL(ch);
9199 if (decimal >= 0) {
9200 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009201 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 continue;
9203 }
9204 if (0 < ch && ch < 256) {
9205 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009206 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 continue;
9208 }
Victor Stinner6345be92011-11-25 20:09:01 +01009209
Victor Stinner42bf7752011-11-21 22:52:58 +01009210 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009211 exc = NULL;
9212 raise_encode_exception(&exc, "decimal", unicode,
9213 startpos, startpos+1,
9214 "invalid decimal Unicode string");
9215 Py_XDECREF(exc);
9216 Py_DECREF(unicode);
9217 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009218 }
9219 /* 0-terminate the output string */
9220 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009221 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009222 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009223}
9224
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225/* --- Helpers ------------------------------------------------------------ */
9226
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of
   the sequence being sliced.
     - end > len          -> end = len
     - end < 0            -> end += len, then clamped to 0 if still negative
     - start < 0          -> start += len, then clamped to 0 if still negative
   A positive `start` is left untouched (it may exceed `end`; callers
   compare the two afterwards).

   The body is wrapped in do { ... } while (0) so that the macro behaves
   as a single statement, e.g. as the body of a brace-less if/else.
   Note that each argument is still evaluated more than once, so pass
   expressions without side effects. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009243any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009245 Py_ssize_t end,
9246 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009248 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 void *buf1, *buf2;
9250 Py_ssize_t len1, len2, result;
9251
9252 kind1 = PyUnicode_KIND(s1);
9253 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009254 if (kind1 < kind2)
9255 return -1;
9256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 len1 = PyUnicode_GET_LENGTH(s1);
9258 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009259 ADJUST_INDICES(start, end, len1);
9260 if (end - start < len2)
9261 return -1;
9262
9263 buf1 = PyUnicode_DATA(s1);
9264 buf2 = PyUnicode_DATA(s2);
9265 if (len2 == 1) {
9266 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9267 result = findchar((const char *)buf1 + kind1*start,
9268 kind1, end - start, ch, direction);
9269 if (result == -1)
9270 return -1;
9271 else
9272 return start + result;
9273 }
9274
9275 if (kind2 != kind1) {
9276 buf2 = _PyUnicode_AsKind(s2, kind1);
9277 if (!buf2)
9278 return -2;
9279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280
Victor Stinner794d5672011-10-10 03:21:36 +02009281 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009282 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009283 case PyUnicode_1BYTE_KIND:
9284 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9285 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9286 else
9287 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_2BYTE_KIND:
9290 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 case PyUnicode_4BYTE_KIND:
9293 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9294 break;
9295 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009296 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009297 }
9298 }
9299 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009300 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009301 case PyUnicode_1BYTE_KIND:
9302 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9303 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9304 else
9305 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_2BYTE_KIND:
9308 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 case PyUnicode_4BYTE_KIND:
9311 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9312 break;
9313 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009314 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 }
9317
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009318 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 PyMem_Free(buf2);
9320
9321 return result;
9322}
9323
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009324/* _PyUnicode_InsertThousandsGrouping() helper functions */
9325#include "stringlib/localeutil.h"
9326
9327/**
9328 * InsertThousandsGrouping:
9329 * @writer: Unicode writer.
9330 * @n_buffer: Number of characters in @buffer.
9331 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9332 * @d_pos: Start of digits string.
9333 * @n_digits: The number of digits in the string, in which we want
9334 * to put the grouping chars.
9335 * @min_width: The minimum width of the digits in the output string.
9336 * Output will be zero-padded on the left to fill.
9337 * @grouping: see definition in localeconv().
9338 * @thousands_sep: see definition in localeconv().
9339 *
9340 * There are 2 modes: counting and filling. If @writer is NULL,
9341 * we are in counting mode, else filling mode.
9342 * If counting, the required buffer size is returned.
9343 * If filling, we know the buffer will be large enough, so we don't
9344 * need to pass in the buffer size.
9345 * Inserts thousand grouping characters (as defined by grouping and
9346 * thousands_sep) into @writer.
9347 *
9348 * Return value: -1 on error, number of characters otherwise.
9349 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009351_PyUnicode_InsertThousandsGrouping(
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009352 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009353 Py_ssize_t n_buffer,
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009354 PyObject *digits,
9355 Py_ssize_t d_pos,
9356 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 Py_ssize_t min_width,
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009358 const char *grouping,
9359 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009360 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361{
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009362 if (writer) {
9363 assert(digits != NULL);
9364 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009365 }
9366 else {
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009367 assert(digits == NULL);
9368 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009370 assert(0 <= d_pos);
9371 assert(0 <= n_digits);
9372 assert(0 <= min_width);
9373 assert(grouping != NULL);
9374
9375 if (digits != NULL) {
9376 if (PyUnicode_READY(digits) == -1) {
9377 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009378 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009379 }
9380 if (PyUnicode_READY(thousands_sep) == -1) {
9381 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 }
9383
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009384 Py_ssize_t count = 0;
9385 Py_ssize_t n_zeros;
9386 int loop_broken = 0;
9387 int use_separator = 0; /* First time through, don't append the
9388 separator. They only go between
9389 groups. */
9390 Py_ssize_t buffer_pos;
9391 Py_ssize_t digits_pos;
9392 Py_ssize_t len;
9393 Py_ssize_t n_chars;
9394 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9395 be looked at */
9396 /* A generator that returns all of the grouping widths, until it
9397 returns 0. */
9398 GroupGenerator groupgen;
9399 GroupGenerator_init(&groupgen, grouping);
9400 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9401
9402 /* if digits are not grouped, thousands separator
9403 should be an empty string */
9404 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9405
9406 digits_pos = d_pos + n_digits;
9407 if (writer) {
9408 buffer_pos = writer->pos + n_buffer;
9409 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9410 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009412 else {
9413 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009414 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009415
9416 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009417 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009418 }
Victor Stinner6f5fa1b2018-11-26 14:17:01 +01009419
9420 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9421 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9422 n_zeros = Py_MAX(0, len - remaining);
9423 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9424
9425 /* Use n_zero zero's and n_chars chars */
9426
9427 /* Count only, don't do anything. */
9428 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9429
9430 /* Copy into the writer. */
9431 InsertThousandsGrouping_fill(writer, &buffer_pos,
9432 digits, &digits_pos,
9433 n_chars, n_zeros,
9434 use_separator ? thousands_sep : NULL,
9435 thousands_sep_len, maxchar);
9436
9437 /* Use a separator next time. */
9438 use_separator = 1;
9439
9440 remaining -= n_chars;
9441 min_width -= len;
9442
9443 if (remaining <= 0 && min_width <= 0) {
9444 loop_broken = 1;
9445 break;
9446 }
9447 min_width -= thousands_sep_len;
9448 }
9449 if (!loop_broken) {
9450 /* We left the loop without using a break statement. */
9451
9452 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9453 n_zeros = Py_MAX(0, len - remaining);
9454 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9455
9456 /* Use n_zero zero's and n_chars chars */
9457 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9458
9459 /* Copy into the writer. */
9460 InsertThousandsGrouping_fill(writer, &buffer_pos,
9461 digits, &digits_pos,
9462 n_chars, n_zeros,
9463 use_separator ? thousands_sep : NULL,
9464 thousands_sep_len, maxchar);
9465 }
9466 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467}
9468
9469
Alexander Belopolsky40018472011-02-26 01:02:56 +00009470Py_ssize_t
9471PyUnicode_Count(PyObject *str,
9472 PyObject *substr,
9473 Py_ssize_t start,
9474 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009476 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009477 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 void *buf1 = NULL, *buf2 = NULL;
9479 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009480
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009481 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009483
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009484 kind1 = PyUnicode_KIND(str);
9485 kind2 = PyUnicode_KIND(substr);
9486 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009487 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009488
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009489 len1 = PyUnicode_GET_LENGTH(str);
9490 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009492 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009493 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009494
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009495 buf1 = PyUnicode_DATA(str);
9496 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009497 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009498 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009499 if (!buf2)
9500 goto onError;
9501 }
9502
9503 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009505 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009506 result = asciilib_count(
9507 ((Py_UCS1*)buf1) + start, end - start,
9508 buf2, len2, PY_SSIZE_T_MAX
9509 );
9510 else
9511 result = ucs1lib_count(
9512 ((Py_UCS1*)buf1) + start, end - start,
9513 buf2, len2, PY_SSIZE_T_MAX
9514 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 break;
9516 case PyUnicode_2BYTE_KIND:
9517 result = ucs2lib_count(
9518 ((Py_UCS2*)buf1) + start, end - start,
9519 buf2, len2, PY_SSIZE_T_MAX
9520 );
9521 break;
9522 case PyUnicode_4BYTE_KIND:
9523 result = ucs4lib_count(
9524 ((Py_UCS4*)buf1) + start, end - start,
9525 buf2, len2, PY_SSIZE_T_MAX
9526 );
9527 break;
9528 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009529 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009531
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009532 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 PyMem_Free(buf2);
9534
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009537 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 PyMem_Free(buf2);
9539 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540}
9541
Alexander Belopolsky40018472011-02-26 01:02:56 +00009542Py_ssize_t
9543PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009544 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009545 Py_ssize_t start,
9546 Py_ssize_t end,
9547 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009549 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009551
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009552 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553}
9554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555Py_ssize_t
9556PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9557 Py_ssize_t start, Py_ssize_t end,
9558 int direction)
9559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009561 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 if (PyUnicode_READY(str) == -1)
9563 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009564 len = PyUnicode_GET_LENGTH(str);
9565 ADJUST_INDICES(start, end, len);
9566 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009567 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009569 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9570 kind, end-start, ch, direction);
9571 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009573 else
9574 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575}
9576
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009578tailmatch(PyObject *self,
9579 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009580 Py_ssize_t start,
9581 Py_ssize_t end,
9582 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 int kind_self;
9585 int kind_sub;
9586 void *data_self;
9587 void *data_sub;
9588 Py_ssize_t offset;
9589 Py_ssize_t i;
9590 Py_ssize_t end_sub;
9591
9592 if (PyUnicode_READY(self) == -1 ||
9593 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009594 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9597 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009599 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009601 if (PyUnicode_GET_LENGTH(substring) == 0)
9602 return 1;
9603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 kind_self = PyUnicode_KIND(self);
9605 data_self = PyUnicode_DATA(self);
9606 kind_sub = PyUnicode_KIND(substring);
9607 data_sub = PyUnicode_DATA(substring);
9608 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9609
9610 if (direction > 0)
9611 offset = end;
9612 else
9613 offset = start;
9614
9615 if (PyUnicode_READ(kind_self, data_self, offset) ==
9616 PyUnicode_READ(kind_sub, data_sub, 0) &&
9617 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9618 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9619 /* If both are of the same kind, memcmp is sufficient */
9620 if (kind_self == kind_sub) {
9621 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009622 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 data_sub,
9624 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009625 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009627 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 else {
9629 /* We do not need to compare 0 and len(substring)-1 because
9630 the if statement above ensured already that they are equal
9631 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 for (i = 1; i < end_sub; ++i) {
9633 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9634 PyUnicode_READ(kind_sub, data_sub, i))
9635 return 0;
9636 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009637 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 }
9640
9641 return 0;
9642}
9643
Alexander Belopolsky40018472011-02-26 01:02:56 +00009644Py_ssize_t
9645PyUnicode_Tailmatch(PyObject *str,
9646 PyObject *substr,
9647 Py_ssize_t start,
9648 Py_ssize_t end,
9649 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009651 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009653
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009654 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655}
9656
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657static PyObject *
9658ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9661 char *resdata, *data = PyUnicode_DATA(self);
9662 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009663
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 res = PyUnicode_New(len, 127);
9665 if (res == NULL)
9666 return NULL;
9667 resdata = PyUnicode_DATA(res);
9668 if (lower)
9669 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671 _Py_bytes_upper(resdata, data, len);
9672 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673}
9674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 Py_ssize_t j;
9679 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009680 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009682
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9684
9685 where ! is a negation and \p{xxx} is a character with property xxx.
9686 */
9687 for (j = i - 1; j >= 0; j--) {
9688 c = PyUnicode_READ(kind, data, j);
9689 if (!_PyUnicode_IsCaseIgnorable(c))
9690 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009692 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9693 if (final_sigma) {
9694 for (j = i + 1; j < length; j++) {
9695 c = PyUnicode_READ(kind, data, j);
9696 if (!_PyUnicode_IsCaseIgnorable(c))
9697 break;
9698 }
9699 final_sigma = j == length || !_PyUnicode_IsCased(c);
9700 }
9701 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702}
9703
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704static int
9705lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9706 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708 /* Obscure special case. */
9709 if (c == 0x3A3) {
9710 mapped[0] = handle_capital_sigma(kind, data, length, i);
9711 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714}
9715
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716static Py_ssize_t
9717do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 Py_ssize_t i, k = 0;
9720 int n_res, j;
9721 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009722
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009723 c = PyUnicode_READ(kind, data, 0);
9724 n_res = _PyUnicode_ToUpperFull(c, mapped);
9725 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009726 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 for (i = 1; i < length; i++) {
9730 c = PyUnicode_READ(kind, data, i);
9731 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9732 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009733 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009735 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009736 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738}
9739
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740static Py_ssize_t
9741do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9742 Py_ssize_t i, k = 0;
9743
9744 for (i = 0; i < length; i++) {
9745 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9746 int n_res, j;
9747 if (Py_UNICODE_ISUPPER(c)) {
9748 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9749 }
9750 else if (Py_UNICODE_ISLOWER(c)) {
9751 n_res = _PyUnicode_ToUpperFull(c, mapped);
9752 }
9753 else {
9754 n_res = 1;
9755 mapped[0] = c;
9756 }
9757 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009758 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 res[k++] = mapped[j];
9760 }
9761 }
9762 return k;
9763}
9764
9765static Py_ssize_t
9766do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9767 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009769 Py_ssize_t i, k = 0;
9770
9771 for (i = 0; i < length; i++) {
9772 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9773 int n_res, j;
9774 if (lower)
9775 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9776 else
9777 n_res = _PyUnicode_ToUpperFull(c, mapped);
9778 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009779 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009780 res[k++] = mapped[j];
9781 }
9782 }
9783 return k;
9784}
9785
9786static Py_ssize_t
9787do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9788{
9789 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9790}
9791
9792static Py_ssize_t
9793do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9794{
9795 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9796}
9797
Benjamin Petersone51757f2012-01-12 21:10:29 -05009798static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009799do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9800{
9801 Py_ssize_t i, k = 0;
9802
9803 for (i = 0; i < length; i++) {
9804 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9805 Py_UCS4 mapped[3];
9806 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9807 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009808 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009809 res[k++] = mapped[j];
9810 }
9811 }
9812 return k;
9813}
9814
9815static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009816do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9817{
9818 Py_ssize_t i, k = 0;
9819 int previous_is_cased;
9820
9821 previous_is_cased = 0;
9822 for (i = 0; i < length; i++) {
9823 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9824 Py_UCS4 mapped[3];
9825 int n_res, j;
9826
9827 if (previous_is_cased)
9828 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9829 else
9830 n_res = _PyUnicode_ToTitleFull(c, mapped);
9831
9832 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009833 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009834 res[k++] = mapped[j];
9835 }
9836
9837 previous_is_cased = _PyUnicode_IsCased(c);
9838 }
9839 return k;
9840}
9841
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009842static PyObject *
9843case_operation(PyObject *self,
9844 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9845{
9846 PyObject *res = NULL;
9847 Py_ssize_t length, newlength = 0;
9848 int kind, outkind;
9849 void *data, *outdata;
9850 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9851
Benjamin Petersoneea48462012-01-16 14:28:50 -05009852 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853
9854 kind = PyUnicode_KIND(self);
9855 data = PyUnicode_DATA(self);
9856 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009857 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009858 PyErr_SetString(PyExc_OverflowError, "string is too long");
9859 return NULL;
9860 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009861 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009862 if (tmp == NULL)
9863 return PyErr_NoMemory();
9864 newlength = perform(kind, data, length, tmp, &maxchar);
9865 res = PyUnicode_New(newlength, maxchar);
9866 if (res == NULL)
9867 goto leave;
9868 tmpend = tmp + newlength;
9869 outdata = PyUnicode_DATA(res);
9870 outkind = PyUnicode_KIND(res);
9871 switch (outkind) {
9872 case PyUnicode_1BYTE_KIND:
9873 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9874 break;
9875 case PyUnicode_2BYTE_KIND:
9876 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9877 break;
9878 case PyUnicode_4BYTE_KIND:
9879 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9880 break;
9881 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009882 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009883 }
9884 leave:
9885 PyMem_FREE(tmp);
9886 return res;
9887}
9888
Tim Peters8ce9f162004-08-27 01:49:32 +00009889PyObject *
9890PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009892 PyObject *res;
9893 PyObject *fseq;
9894 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009895 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009897 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009898 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009899 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009900 }
9901
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009902 /* NOTE: the following code can't call back into Python code,
9903 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009904 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009905
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009906 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009908 res = _PyUnicode_JoinArray(separator, items, seqlen);
9909 Py_DECREF(fseq);
9910 return res;
9911}
9912
9913PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009914_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009915{
9916 PyObject *res = NULL; /* the result */
9917 PyObject *sep = NULL;
9918 Py_ssize_t seplen;
9919 PyObject *item;
9920 Py_ssize_t sz, i, res_offset;
9921 Py_UCS4 maxchar;
9922 Py_UCS4 item_maxchar;
9923 int use_memcpy;
9924 unsigned char *res_data = NULL, *sep_data = NULL;
9925 PyObject *last_obj;
9926 unsigned int kind = 0;
9927
Tim Peters05eba1f2004-08-27 21:32:02 +00009928 /* If empty sequence, return u"". */
9929 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009930 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009932
Tim Peters05eba1f2004-08-27 21:32:02 +00009933 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009934 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009935 if (seqlen == 1) {
9936 if (PyUnicode_CheckExact(items[0])) {
9937 res = items[0];
9938 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009939 return res;
9940 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009941 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009942 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009943 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009944 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009945 /* Set up sep and seplen */
9946 if (separator == NULL) {
9947 /* fall back to a blank space separator */
9948 sep = PyUnicode_FromOrdinal(' ');
9949 if (!sep)
9950 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009951 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009952 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009954 else {
9955 if (!PyUnicode_Check(separator)) {
9956 PyErr_Format(PyExc_TypeError,
9957 "separator: expected str instance,"
9958 " %.80s found",
9959 Py_TYPE(separator)->tp_name);
9960 goto onError;
9961 }
9962 if (PyUnicode_READY(separator))
9963 goto onError;
9964 sep = separator;
9965 seplen = PyUnicode_GET_LENGTH(separator);
9966 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9967 /* inc refcount to keep this code path symmetric with the
9968 above case of a blank separator */
9969 Py_INCREF(sep);
9970 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009971 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009972 }
9973
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009974 /* There are at least two things to join, or else we have a subclass
9975 * of str in the sequence.
9976 * Do a pre-pass to figure out the total amount of space we'll
9977 * need (sz), and see whether all argument are strings.
9978 */
9979 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009980#ifdef Py_DEBUG
9981 use_memcpy = 0;
9982#else
9983 use_memcpy = 1;
9984#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009985 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009986 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009987 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 if (!PyUnicode_Check(item)) {
9989 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009990 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009991 " %.80s found",
9992 i, Py_TYPE(item)->tp_name);
9993 goto onError;
9994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 if (PyUnicode_READY(item) == -1)
9996 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009997 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009999 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010000 if (i != 0) {
10001 add_sz += seplen;
10002 }
10003 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 goto onError;
10007 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010008 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010009 if (use_memcpy && last_obj != NULL) {
10010 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10011 use_memcpy = 0;
10012 }
10013 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010014 }
Tim Petersced69f82003-09-16 20:30:58 +000010015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017 if (res == NULL)
10018 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010019
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010020 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010021#ifdef Py_DEBUG
10022 use_memcpy = 0;
10023#else
10024 if (use_memcpy) {
10025 res_data = PyUnicode_1BYTE_DATA(res);
10026 kind = PyUnicode_KIND(res);
10027 if (seplen != 0)
10028 sep_data = PyUnicode_1BYTE_DATA(sep);
10029 }
10030#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010031 if (use_memcpy) {
10032 for (i = 0; i < seqlen; ++i) {
10033 Py_ssize_t itemlen;
10034 item = items[i];
10035
10036 /* Copy item, and maybe the separator. */
10037 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010038 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010040 kind * seplen);
10041 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010043
10044 itemlen = PyUnicode_GET_LENGTH(item);
10045 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010046 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010048 kind * itemlen);
10049 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010051 }
10052 assert(res_data == PyUnicode_1BYTE_DATA(res)
10053 + kind * PyUnicode_GET_LENGTH(res));
10054 }
10055 else {
10056 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10057 Py_ssize_t itemlen;
10058 item = items[i];
10059
10060 /* Copy item, and maybe the separator. */
10061 if (i && seplen != 0) {
10062 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10063 res_offset += seplen;
10064 }
10065
10066 itemlen = PyUnicode_GET_LENGTH(item);
10067 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010068 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 res_offset += itemlen;
10070 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010071 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010073 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010076 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010081 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 return NULL;
10083}
10084
Victor Stinnerd3f08822012-05-29 12:57:52 +020010085void
10086_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10087 Py_UCS4 fill_char)
10088{
10089 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner7f9fb0f2018-11-27 12:42:04 +010010090 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010091 assert(PyUnicode_IS_READY(unicode));
10092 assert(unicode_modifiable(unicode));
10093 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10094 assert(start >= 0);
10095 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10096 FILL(kind, data, fill_char, start, length);
10097}
10098
Victor Stinner3fe55312012-01-04 00:33:50 +010010099Py_ssize_t
10100PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10101 Py_UCS4 fill_char)
10102{
10103 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010104
10105 if (!PyUnicode_Check(unicode)) {
10106 PyErr_BadInternalCall();
10107 return -1;
10108 }
10109 if (PyUnicode_READY(unicode) == -1)
10110 return -1;
10111 if (unicode_check_modifiable(unicode))
10112 return -1;
10113
Victor Stinnerd3f08822012-05-29 12:57:52 +020010114 if (start < 0) {
10115 PyErr_SetString(PyExc_IndexError, "string index out of range");
10116 return -1;
10117 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010118 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10119 PyErr_SetString(PyExc_ValueError,
10120 "fill character is bigger than "
10121 "the string maximum character");
10122 return -1;
10123 }
10124
10125 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10126 length = Py_MIN(maxlen, length);
10127 if (length <= 0)
10128 return 0;
10129
Victor Stinnerd3f08822012-05-29 12:57:52 +020010130 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010131 return length;
10132}
10133
Victor Stinner9310abb2011-10-05 00:59:23 +020010134static PyObject *
10135pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010136 Py_ssize_t left,
10137 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 PyObject *u;
10141 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010142 int kind;
10143 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144
10145 if (left < 0)
10146 left = 0;
10147 if (right < 0)
10148 right = 0;
10149
Victor Stinnerc4b49542011-12-11 22:44:26 +010010150 if (left == 0 && right == 0)
10151 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10154 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010155 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10156 return NULL;
10157 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010159 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010161 if (!u)
10162 return NULL;
10163
10164 kind = PyUnicode_KIND(u);
10165 data = PyUnicode_DATA(u);
10166 if (left)
10167 FILL(kind, data, fill, 0, left);
10168 if (right)
10169 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010170 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010171 assert(_PyUnicode_CheckConsistency(u, 1));
10172 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173}
10174
Alexander Belopolsky40018472011-02-26 01:02:56 +000010175PyObject *
10176PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010180 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Benjamin Petersonead6b532011-12-20 17:23:42 -060010183 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010185 if (PyUnicode_IS_ASCII(string))
10186 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010187 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 PyUnicode_GET_LENGTH(string), keepends);
10189 else
10190 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 break;
10194 case PyUnicode_2BYTE_KIND:
10195 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 PyUnicode_GET_LENGTH(string), keepends);
10198 break;
10199 case PyUnicode_4BYTE_KIND:
10200 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 PyUnicode_GET_LENGTH(string), keepends);
10203 break;
10204 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010205 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208}
10209
Alexander Belopolsky40018472011-02-26 01:02:56 +000010210static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010211split(PyObject *self,
10212 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010213 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010215 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 void *buf1, *buf2;
10217 Py_ssize_t len1, len2;
10218 PyObject* out;
10219
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010221 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (PyUnicode_READY(self) == -1)
10224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010227 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 if (PyUnicode_IS_ASCII(self))
10230 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010231 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010232 PyUnicode_GET_LENGTH(self), maxcount
10233 );
10234 else
10235 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010236 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010237 PyUnicode_GET_LENGTH(self), maxcount
10238 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 case PyUnicode_2BYTE_KIND:
10240 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 PyUnicode_GET_LENGTH(self), maxcount
10243 );
10244 case PyUnicode_4BYTE_KIND:
10245 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010246 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 PyUnicode_GET_LENGTH(self), maxcount
10248 );
10249 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010250 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 }
10252
10253 if (PyUnicode_READY(substring) == -1)
10254 return NULL;
10255
10256 kind1 = PyUnicode_KIND(self);
10257 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 len1 = PyUnicode_GET_LENGTH(self);
10259 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010260 if (kind1 < kind2 || len1 < len2) {
10261 out = PyList_New(1);
10262 if (out == NULL)
10263 return NULL;
10264 Py_INCREF(self);
10265 PyList_SET_ITEM(out, 0, self);
10266 return out;
10267 }
10268 buf1 = PyUnicode_DATA(self);
10269 buf2 = PyUnicode_DATA(substring);
10270 if (kind2 != kind1) {
10271 buf2 = _PyUnicode_AsKind(substring, kind1);
10272 if (!buf2)
10273 return NULL;
10274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010276 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10279 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010280 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010281 else
10282 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 break;
10285 case PyUnicode_2BYTE_KIND:
10286 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 break;
10289 case PyUnicode_4BYTE_KIND:
10290 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 break;
10293 default:
10294 out = NULL;
10295 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010296 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 PyMem_Free(buf2);
10298 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299}
10300
Alexander Belopolsky40018472011-02-26 01:02:56 +000010301static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010302rsplit(PyObject *self,
10303 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010304 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010305{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010306 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 void *buf1, *buf2;
10308 Py_ssize_t len1, len2;
10309 PyObject* out;
10310
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010311 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010312 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (PyUnicode_READY(self) == -1)
10315 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010318 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 if (PyUnicode_IS_ASCII(self))
10321 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010322 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 PyUnicode_GET_LENGTH(self), maxcount
10324 );
10325 else
10326 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 PyUnicode_GET_LENGTH(self), maxcount
10329 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 case PyUnicode_2BYTE_KIND:
10331 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 PyUnicode_GET_LENGTH(self), maxcount
10334 );
10335 case PyUnicode_4BYTE_KIND:
10336 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyUnicode_GET_LENGTH(self), maxcount
10339 );
10340 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010341 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 }
10343
10344 if (PyUnicode_READY(substring) == -1)
10345 return NULL;
10346
10347 kind1 = PyUnicode_KIND(self);
10348 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 len1 = PyUnicode_GET_LENGTH(self);
10350 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010351 if (kind1 < kind2 || len1 < len2) {
10352 out = PyList_New(1);
10353 if (out == NULL)
10354 return NULL;
10355 Py_INCREF(self);
10356 PyList_SET_ITEM(out, 0, self);
10357 return out;
10358 }
10359 buf1 = PyUnicode_DATA(self);
10360 buf2 = PyUnicode_DATA(substring);
10361 if (kind2 != kind1) {
10362 buf2 = _PyUnicode_AsKind(substring, kind1);
10363 if (!buf2)
10364 return NULL;
10365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010367 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10370 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010371 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372 else
10373 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010374 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 break;
10376 case PyUnicode_2BYTE_KIND:
10377 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010378 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 break;
10380 case PyUnicode_4BYTE_KIND:
10381 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 default:
10385 out = NULL;
10386 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010387 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 PyMem_Free(buf2);
10389 return out;
10390}
10391
10392static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10394 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010396 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10399 return asciilib_find(buf1, len1, buf2, len2, offset);
10400 else
10401 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 case PyUnicode_2BYTE_KIND:
10403 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10404 case PyUnicode_4BYTE_KIND:
10405 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10406 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010407 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408}
10409
10410static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010411anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10412 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010414 switch (kind) {
10415 case PyUnicode_1BYTE_KIND:
10416 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10417 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10418 else
10419 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10420 case PyUnicode_2BYTE_KIND:
10421 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10422 case PyUnicode_4BYTE_KIND:
10423 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10424 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010425 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010426}
10427
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010428static void
10429replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10430 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10431{
10432 int kind = PyUnicode_KIND(u);
10433 void *data = PyUnicode_DATA(u);
10434 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10435 if (kind == PyUnicode_1BYTE_KIND) {
10436 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10437 (Py_UCS1 *)data + len,
10438 u1, u2, maxcount);
10439 }
10440 else if (kind == PyUnicode_2BYTE_KIND) {
10441 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10442 (Py_UCS2 *)data + len,
10443 u1, u2, maxcount);
10444 }
10445 else {
10446 assert(kind == PyUnicode_4BYTE_KIND);
10447 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10448 (Py_UCS4 *)data + len,
10449 u1, u2, maxcount);
10450 }
10451}
10452
Alexander Belopolsky40018472011-02-26 01:02:56 +000010453static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454replace(PyObject *self, PyObject *str1,
10455 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 PyObject *u;
10458 char *sbuf = PyUnicode_DATA(self);
10459 char *buf1 = PyUnicode_DATA(str1);
10460 char *buf2 = PyUnicode_DATA(str2);
10461 int srelease = 0, release1 = 0, release2 = 0;
10462 int skind = PyUnicode_KIND(self);
10463 int kind1 = PyUnicode_KIND(str1);
10464 int kind2 = PyUnicode_KIND(str2);
10465 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10466 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10467 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010468 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010469 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470
10471 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010474 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
Victor Stinner59de0ee2011-10-07 10:01:28 +020010476 if (str1 == str2)
10477 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478
Victor Stinner49a0a212011-10-12 23:46:10 +020010479 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010480 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10481 if (maxchar < maxchar_str1)
10482 /* substring too wide to be present */
10483 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010484 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10485 /* Replacing str1 with str2 may cause a maxchar reduction in the
10486 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010487 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010488 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010491 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010493 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010495 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010498
Victor Stinner69ed0f42013-04-09 21:48:24 +020010499 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010500 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010501 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010503 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010505 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010507
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10509 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010510 }
10511 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 int rkind = skind;
10513 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010514 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (kind1 < rkind) {
10517 /* widen substring */
10518 buf1 = _PyUnicode_AsKind(str1, rkind);
10519 if (!buf1) goto error;
10520 release1 = 1;
10521 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010522 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 if (i < 0)
10524 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (rkind > kind2) {
10526 /* widen replacement */
10527 buf2 = _PyUnicode_AsKind(str2, rkind);
10528 if (!buf2) goto error;
10529 release2 = 1;
10530 }
10531 else if (rkind < kind2) {
10532 /* widen self and buf1 */
10533 rkind = kind2;
10534 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010535 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 sbuf = _PyUnicode_AsKind(self, rkind);
10537 if (!sbuf) goto error;
10538 srelease = 1;
10539 buf1 = _PyUnicode_AsKind(str1, rkind);
10540 if (!buf1) goto error;
10541 release1 = 1;
10542 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 u = PyUnicode_New(slen, maxchar);
10544 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 assert(PyUnicode_KIND(u) == rkind);
10547 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010548
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010549 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010550 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010551 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010553 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010555
10556 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010558 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010559 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010560 if (i == -1)
10561 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010562 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010564 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010568 }
10569 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010571 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 int rkind = skind;
10573 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 buf1 = _PyUnicode_AsKind(str1, rkind);
10578 if (!buf1) goto error;
10579 release1 = 1;
10580 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010581 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010582 if (n == 0)
10583 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010585 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 buf2 = _PyUnicode_AsKind(str2, rkind);
10587 if (!buf2) goto error;
10588 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010591 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 rkind = kind2;
10593 sbuf = _PyUnicode_AsKind(self, rkind);
10594 if (!sbuf) goto error;
10595 srelease = 1;
10596 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010597 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf1 = _PyUnicode_AsKind(str1, rkind);
10599 if (!buf1) goto error;
10600 release1 = 1;
10601 }
10602 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10603 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010604 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 PyErr_SetString(PyExc_OverflowError,
10606 "replace string is too long");
10607 goto error;
10608 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010609 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010610 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010611 _Py_INCREF_UNICODE_EMPTY();
10612 if (!unicode_empty)
10613 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010614 u = unicode_empty;
10615 goto done;
10616 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010617 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 PyErr_SetString(PyExc_OverflowError,
10619 "replace string is too long");
10620 goto error;
10621 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010622 u = PyUnicode_New(new_size, maxchar);
10623 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010625 assert(PyUnicode_KIND(u) == rkind);
10626 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 ires = i = 0;
10628 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 while (n-- > 0) {
10630 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010631 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010633 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010634 if (j == -1)
10635 break;
10636 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010638 memcpy(res + rkind * ires,
10639 sbuf + rkind * i,
10640 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 }
10643 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010645 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010647 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 memcpy(res + rkind * ires,
10655 sbuf + rkind * i,
10656 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 }
10658 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 /* interleave */
10660 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010663 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 if (--n <= 0)
10666 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 memcpy(res + rkind * ires,
10668 sbuf + rkind * i,
10669 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 ires++;
10671 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * ires,
10674 sbuf + rkind * i,
10675 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010676 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010677 }
10678
10679 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010680 unicode_adjust_maxchar(&u);
10681 if (u == NULL)
10682 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010684
10685 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (srelease)
10687 PyMem_FREE(sbuf);
10688 if (release1)
10689 PyMem_FREE(buf1);
10690 if (release2)
10691 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010692 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010696 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (srelease)
10698 PyMem_FREE(sbuf);
10699 if (release1)
10700 PyMem_FREE(buf1);
10701 if (release2)
10702 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010703 return unicode_result_unchanged(self);
10704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 error:
10706 if (srelease && sbuf)
10707 PyMem_FREE(sbuf);
10708 if (release1 && buf1)
10709 PyMem_FREE(buf1);
10710 if (release2 && buf2)
10711 PyMem_FREE(buf2);
10712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713}
10714
10715/* --- Unicode Object Methods --------------------------------------------- */
10716
INADA Naoki3ae20562017-01-16 20:41:20 +090010717/*[clinic input]
10718str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719
INADA Naoki3ae20562017-01-16 20:41:20 +090010720Return a version of the string where each word is titlecased.
10721
10722More specifically, words start with uppercased characters and all remaining
10723cased characters have lower case.
10724[clinic start generated code]*/
10725
10726static PyObject *
10727unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010728/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010730 if (PyUnicode_READY(self) == -1)
10731 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010732 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733}
10734
INADA Naoki3ae20562017-01-16 20:41:20 +090010735/*[clinic input]
10736str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737
INADA Naoki3ae20562017-01-16 20:41:20 +090010738Return a capitalized version of the string.
10739
10740More specifically, make the first character have upper case and the rest lower
10741case.
10742[clinic start generated code]*/
10743
10744static PyObject *
10745unicode_capitalize_impl(PyObject *self)
10746/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010748 if (PyUnicode_READY(self) == -1)
10749 return NULL;
10750 if (PyUnicode_GET_LENGTH(self) == 0)
10751 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010752 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753}
10754
INADA Naoki3ae20562017-01-16 20:41:20 +090010755/*[clinic input]
10756str.casefold as unicode_casefold
10757
10758Return a version of the string suitable for caseless comparisons.
10759[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010760
10761static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010762unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010763/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010764{
10765 if (PyUnicode_READY(self) == -1)
10766 return NULL;
10767 if (PyUnicode_IS_ASCII(self))
10768 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010769 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010770}
10771
10772
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010773/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010774
10775static int
10776convert_uc(PyObject *obj, void *addr)
10777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010779
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010780 if (!PyUnicode_Check(obj)) {
10781 PyErr_Format(PyExc_TypeError,
10782 "The fill character must be a unicode character, "
10783 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010784 return 0;
10785 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010786 if (PyUnicode_READY(obj) < 0)
10787 return 0;
10788 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010789 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010791 return 0;
10792 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010793 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010795}
10796
INADA Naoki3ae20562017-01-16 20:41:20 +090010797/*[clinic input]
10798str.center as unicode_center
10799
10800 width: Py_ssize_t
10801 fillchar: Py_UCS4 = ' '
10802 /
10803
10804Return a centered string of length width.
10805
10806Padding is done using the specified fill character (default is a space).
10807[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808
10809static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010810unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10811/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010813 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814
Benjamin Petersonbac79492012-01-14 13:34:47 -050010815 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816 return NULL;
10817
Victor Stinnerc4b49542011-12-11 22:44:26 +010010818 if (PyUnicode_GET_LENGTH(self) >= width)
10819 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
Victor Stinnerc4b49542011-12-11 22:44:26 +010010821 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 left = marg / 2 + (marg & width & 1);
10823
Victor Stinner9310abb2011-10-05 00:59:23 +020010824 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825}
10826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827/* This function assumes that str1 and str2 are readied by the caller. */
10828
Marc-André Lemburge5034372000-08-08 08:04:29 +000010829static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010830unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010831{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010832#define COMPARE(TYPE1, TYPE2) \
10833 do { \
10834 TYPE1* p1 = (TYPE1 *)data1; \
10835 TYPE2* p2 = (TYPE2 *)data2; \
10836 TYPE1* end = p1 + len; \
10837 Py_UCS4 c1, c2; \
10838 for (; p1 != end; p1++, p2++) { \
10839 c1 = *p1; \
10840 c2 = *p2; \
10841 if (c1 != c2) \
10842 return (c1 < c2) ? -1 : 1; \
10843 } \
10844 } \
10845 while (0)
10846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 int kind1, kind2;
10848 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010849 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 kind1 = PyUnicode_KIND(str1);
10852 kind2 = PyUnicode_KIND(str2);
10853 data1 = PyUnicode_DATA(str1);
10854 data2 = PyUnicode_DATA(str2);
10855 len1 = PyUnicode_GET_LENGTH(str1);
10856 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010857 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010858
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859 switch(kind1) {
10860 case PyUnicode_1BYTE_KIND:
10861 {
10862 switch(kind2) {
10863 case PyUnicode_1BYTE_KIND:
10864 {
10865 int cmp = memcmp(data1, data2, len);
10866 /* normalize result of memcmp() into the range [-1; 1] */
10867 if (cmp < 0)
10868 return -1;
10869 if (cmp > 0)
10870 return 1;
10871 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010872 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010873 case PyUnicode_2BYTE_KIND:
10874 COMPARE(Py_UCS1, Py_UCS2);
10875 break;
10876 case PyUnicode_4BYTE_KIND:
10877 COMPARE(Py_UCS1, Py_UCS4);
10878 break;
10879 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010880 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010881 }
10882 break;
10883 }
10884 case PyUnicode_2BYTE_KIND:
10885 {
10886 switch(kind2) {
10887 case PyUnicode_1BYTE_KIND:
10888 COMPARE(Py_UCS2, Py_UCS1);
10889 break;
10890 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010891 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010892 COMPARE(Py_UCS2, Py_UCS2);
10893 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010894 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010895 case PyUnicode_4BYTE_KIND:
10896 COMPARE(Py_UCS2, Py_UCS4);
10897 break;
10898 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010899 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010900 }
10901 break;
10902 }
10903 case PyUnicode_4BYTE_KIND:
10904 {
10905 switch(kind2) {
10906 case PyUnicode_1BYTE_KIND:
10907 COMPARE(Py_UCS4, Py_UCS1);
10908 break;
10909 case PyUnicode_2BYTE_KIND:
10910 COMPARE(Py_UCS4, Py_UCS2);
10911 break;
10912 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010913 {
10914#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10915 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10916 /* normalize result of wmemcmp() into the range [-1; 1] */
10917 if (cmp < 0)
10918 return -1;
10919 if (cmp > 0)
10920 return 1;
10921#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010924 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010925 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010926 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010927 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010928 }
10929 break;
10930 }
10931 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010932 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010933 }
10934
Victor Stinner770e19e2012-10-04 22:59:45 +020010935 if (len1 == len2)
10936 return 0;
10937 if (len1 < len2)
10938 return -1;
10939 else
10940 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010941
10942#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010943}
10944
Benjamin Peterson621b4302016-09-09 13:54:34 -070010945static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010946unicode_compare_eq(PyObject *str1, PyObject *str2)
10947{
10948 int kind;
10949 void *data1, *data2;
10950 Py_ssize_t len;
10951 int cmp;
10952
Victor Stinnere5567ad2012-10-23 02:48:49 +020010953 len = PyUnicode_GET_LENGTH(str1);
10954 if (PyUnicode_GET_LENGTH(str2) != len)
10955 return 0;
10956 kind = PyUnicode_KIND(str1);
10957 if (PyUnicode_KIND(str2) != kind)
10958 return 0;
10959 data1 = PyUnicode_DATA(str1);
10960 data2 = PyUnicode_DATA(str2);
10961
10962 cmp = memcmp(data1, data2, len * kind);
10963 return (cmp == 0);
10964}
10965
10966
Alexander Belopolsky40018472011-02-26 01:02:56 +000010967int
10968PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10971 if (PyUnicode_READY(left) == -1 ||
10972 PyUnicode_READY(right) == -1)
10973 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010974
10975 /* a string is equal to itself */
10976 if (left == right)
10977 return 0;
10978
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010979 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010981 PyErr_Format(PyExc_TypeError,
10982 "Can't compare %.100s and %.100s",
10983 left->ob_type->tp_name,
10984 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 return -1;
10986}
10987
Martin v. Löwis5b222132007-06-10 09:51:05 +000010988int
10989PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 Py_ssize_t i;
10992 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010994 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995
Victor Stinner910337b2011-10-03 03:20:16 +020010996 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010997 if (!PyUnicode_IS_READY(uni)) {
10998 const wchar_t *ws = _PyUnicode_WSTR(uni);
10999 /* Compare Unicode string and source character set string */
11000 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11001 if (chr != ustr[i])
11002 return (chr < ustr[i]) ? -1 : 1;
11003 }
11004 /* This check keeps Python strings that end in '\0' from comparing equal
11005 to C strings identical up to that point. */
11006 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11007 return 1; /* uni is longer */
11008 if (ustr[i])
11009 return -1; /* str is longer */
11010 return 0;
11011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011013 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011014 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011015 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011016 size_t len, len2 = strlen(str);
11017 int cmp;
11018
11019 len = Py_MIN(len1, len2);
11020 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011021 if (cmp != 0) {
11022 if (cmp < 0)
11023 return -1;
11024 else
11025 return 1;
11026 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011027 if (len1 > len2)
11028 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011029 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011030 return -1; /* str is longer */
11031 return 0;
11032 }
11033 else {
11034 void *data = PyUnicode_DATA(uni);
11035 /* Compare Unicode string and source character set string */
11036 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011037 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011038 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11039 /* This check keeps Python strings that end in '\0' from comparing equal
11040 to C strings identical up to that point. */
11041 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11042 return 1; /* uni is longer */
11043 if (str[i])
11044 return -1; /* str is longer */
11045 return 0;
11046 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011047}
11048
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011049static int
11050non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11051{
11052 size_t i, len;
11053 const wchar_t *p;
11054 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11055 if (strlen(str) != len)
11056 return 0;
11057 p = _PyUnicode_WSTR(unicode);
11058 assert(p);
11059 for (i = 0; i < len; i++) {
11060 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011061 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011062 return 0;
11063 }
11064 return 1;
11065}
11066
11067int
11068_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11069{
11070 size_t len;
11071 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011072 assert(str);
11073#ifndef NDEBUG
11074 for (const char *p = str; *p; p++) {
11075 assert((unsigned char)*p < 128);
11076 }
11077#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011078 if (PyUnicode_READY(unicode) == -1) {
11079 /* Memory error or bad data */
11080 PyErr_Clear();
11081 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11082 }
11083 if (!PyUnicode_IS_ASCII(unicode))
11084 return 0;
11085 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11086 return strlen(str) == len &&
11087 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11088}
11089
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011090int
11091_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11092{
11093 PyObject *right_uni;
11094 Py_hash_t hash;
11095
11096 assert(_PyUnicode_CHECK(left));
11097 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011098#ifndef NDEBUG
11099 for (const char *p = right->string; *p; p++) {
11100 assert((unsigned char)*p < 128);
11101 }
11102#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011103
11104 if (PyUnicode_READY(left) == -1) {
11105 /* memory error or bad data */
11106 PyErr_Clear();
11107 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11108 }
11109
11110 if (!PyUnicode_IS_ASCII(left))
11111 return 0;
11112
11113 right_uni = _PyUnicode_FromId(right); /* borrowed */
11114 if (right_uni == NULL) {
11115 /* memory error or bad data */
11116 PyErr_Clear();
11117 return _PyUnicode_EqualToASCIIString(left, right->string);
11118 }
11119
11120 if (left == right_uni)
11121 return 1;
11122
11123 if (PyUnicode_CHECK_INTERNED(left))
11124 return 0;
11125
INADA Naoki7cc95f52018-01-28 02:07:09 +090011126 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011127 hash = _PyUnicode_HASH(left);
11128 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11129 return 0;
11130
11131 return unicode_compare_eq(left, right_uni);
11132}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011133
Alexander Belopolsky40018472011-02-26 01:02:56 +000011134PyObject *
11135PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011136{
11137 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011138
Victor Stinnere5567ad2012-10-23 02:48:49 +020011139 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11140 Py_RETURN_NOTIMPLEMENTED;
11141
11142 if (PyUnicode_READY(left) == -1 ||
11143 PyUnicode_READY(right) == -1)
11144 return NULL;
11145
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011146 if (left == right) {
11147 switch (op) {
11148 case Py_EQ:
11149 case Py_LE:
11150 case Py_GE:
11151 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011152 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011153 case Py_NE:
11154 case Py_LT:
11155 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011156 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011157 default:
11158 PyErr_BadArgument();
11159 return NULL;
11160 }
11161 }
11162 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011163 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011164 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011165 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011166 }
11167 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011168 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011169 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011170 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011171}
11172
Alexander Belopolsky40018472011-02-26 01:02:56 +000011173int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011174_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11175{
11176 return unicode_eq(aa, bb);
11177}
11178
11179int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011180PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011181{
Victor Stinner77282cb2013-04-14 19:22:47 +020011182 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 void *buf1, *buf2;
11184 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011185 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011186
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011187 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011189 "'in <string>' requires string as left operand, not %.100s",
11190 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011191 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011192 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011193 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011194 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011195 if (ensure_unicode(str) < 0)
11196 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011199 kind2 = PyUnicode_KIND(substr);
11200 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011201 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011203 len2 = PyUnicode_GET_LENGTH(substr);
11204 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011205 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011206 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011207 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011208 if (len2 == 1) {
11209 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11210 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011211 return result;
11212 }
11213 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011214 buf2 = _PyUnicode_AsKind(substr, kind1);
11215 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011216 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218
Victor Stinner77282cb2013-04-14 19:22:47 +020011219 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 case PyUnicode_1BYTE_KIND:
11221 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11222 break;
11223 case PyUnicode_2BYTE_KIND:
11224 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11225 break;
11226 case PyUnicode_4BYTE_KIND:
11227 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11228 break;
11229 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011230 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011232
Victor Stinner77282cb2013-04-14 19:22:47 +020011233 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 PyMem_Free(buf2);
11235
Guido van Rossum403d68b2000-03-13 15:55:09 +000011236 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011237}
11238
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239/* Concat to string or Unicode object giving a new Unicode object. */
11240
Alexander Belopolsky40018472011-02-26 01:02:56 +000011241PyObject *
11242PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011244 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011245 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011246 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011248 if (ensure_unicode(left) < 0)
11249 return NULL;
11250
11251 if (!PyUnicode_Check(right)) {
11252 PyErr_Format(PyExc_TypeError,
11253 "can only concatenate str (not \"%.200s\") to str",
11254 right->ob_type->tp_name);
11255 return NULL;
11256 }
11257 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011258 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
11260 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011261 if (left == unicode_empty)
11262 return PyUnicode_FromObject(right);
11263 if (right == unicode_empty)
11264 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011266 left_len = PyUnicode_GET_LENGTH(left);
11267 right_len = PyUnicode_GET_LENGTH(right);
11268 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011269 PyErr_SetString(PyExc_OverflowError,
11270 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011271 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011272 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011273 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011274
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011275 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11276 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011277 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011280 result = PyUnicode_New(new_len, maxchar);
11281 if (result == NULL)
11282 return NULL;
11283 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11284 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11285 assert(_PyUnicode_CheckConsistency(result, 1));
11286 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287}
11288
Walter Dörwald1ab83302007-05-18 17:15:44 +000011289void
Victor Stinner23e56682011-10-03 03:54:37 +020011290PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011291{
Victor Stinner23e56682011-10-03 03:54:37 +020011292 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011293 Py_UCS4 maxchar, maxchar2;
11294 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011295
11296 if (p_left == NULL) {
11297 if (!PyErr_Occurred())
11298 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011299 return;
11300 }
Victor Stinner23e56682011-10-03 03:54:37 +020011301 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011302 if (right == NULL || left == NULL
11303 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011304 if (!PyErr_Occurred())
11305 PyErr_BadInternalCall();
11306 goto error;
11307 }
11308
Benjamin Petersonbac79492012-01-14 13:34:47 -050011309 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011310 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011311 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011312 goto error;
11313
Victor Stinner488fa492011-12-12 00:01:39 +010011314 /* Shortcuts */
11315 if (left == unicode_empty) {
11316 Py_DECREF(left);
11317 Py_INCREF(right);
11318 *p_left = right;
11319 return;
11320 }
11321 if (right == unicode_empty)
11322 return;
11323
11324 left_len = PyUnicode_GET_LENGTH(left);
11325 right_len = PyUnicode_GET_LENGTH(right);
11326 if (left_len > PY_SSIZE_T_MAX - right_len) {
11327 PyErr_SetString(PyExc_OverflowError,
11328 "strings are too large to concat");
11329 goto error;
11330 }
11331 new_len = left_len + right_len;
11332
11333 if (unicode_modifiable(left)
11334 && PyUnicode_CheckExact(right)
11335 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011336 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11337 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011338 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011339 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011340 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11341 {
11342 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011343 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011344 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011345
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011346 /* copy 'right' into the newly allocated area of 'left' */
11347 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011348 }
Victor Stinner488fa492011-12-12 00:01:39 +010011349 else {
11350 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11351 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011352 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011353
Victor Stinner488fa492011-12-12 00:01:39 +010011354 /* Concat the two Unicode strings */
11355 res = PyUnicode_New(new_len, maxchar);
11356 if (res == NULL)
11357 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011358 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11359 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011360 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011361 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011362 }
11363 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011364 return;
11365
11366error:
Victor Stinner488fa492011-12-12 00:01:39 +010011367 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011368}
11369
11370void
11371PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011373 PyUnicode_Append(pleft, right);
11374 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011375}
11376
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011377/*
11378Wraps stringlib_parse_args_finds() and additionally ensures that the
11379first argument is a unicode object.
11380*/
11381
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011382static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011383parse_args_finds_unicode(const char * function_name, PyObject *args,
11384 PyObject **substring,
11385 Py_ssize_t *start, Py_ssize_t *end)
11386{
11387 if(stringlib_parse_args_finds(function_name, args, substring,
11388 start, end)) {
11389 if (ensure_unicode(*substring) < 0)
11390 return 0;
11391 return 1;
11392 }
11393 return 0;
11394}
11395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011396PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011397 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011399Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011400string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
11403static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011404unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011406 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011407 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011408 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011410 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 void *buf1, *buf2;
11412 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011414 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 kind1 = PyUnicode_KIND(self);
11418 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011419 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011420 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 len1 = PyUnicode_GET_LENGTH(self);
11423 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011425 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011426 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011427
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011428 buf1 = PyUnicode_DATA(self);
11429 buf2 = PyUnicode_DATA(substring);
11430 if (kind2 != kind1) {
11431 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011432 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011433 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011434 }
11435 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 case PyUnicode_1BYTE_KIND:
11437 iresult = ucs1lib_count(
11438 ((Py_UCS1*)buf1) + start, end - start,
11439 buf2, len2, PY_SSIZE_T_MAX
11440 );
11441 break;
11442 case PyUnicode_2BYTE_KIND:
11443 iresult = ucs2lib_count(
11444 ((Py_UCS2*)buf1) + start, end - start,
11445 buf2, len2, PY_SSIZE_T_MAX
11446 );
11447 break;
11448 case PyUnicode_4BYTE_KIND:
11449 iresult = ucs4lib_count(
11450 ((Py_UCS4*)buf1) + start, end - start,
11451 buf2, len2, PY_SSIZE_T_MAX
11452 );
11453 break;
11454 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011455 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 }
11457
11458 result = PyLong_FromSsize_t(iresult);
11459
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011460 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 return result;
11464}
11465
INADA Naoki3ae20562017-01-16 20:41:20 +090011466/*[clinic input]
11467str.encode as unicode_encode
11468
11469 encoding: str(c_default="NULL") = 'utf-8'
11470 The encoding in which to encode the string.
11471 errors: str(c_default="NULL") = 'strict'
11472 The error handling scheme to use for encoding errors.
11473 The default is 'strict' meaning that encoding errors raise a
11474 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11475 'xmlcharrefreplace' as well as any other name registered with
11476 codecs.register_error that can handle UnicodeEncodeErrors.
11477
11478Encode the string using the codec registered for encoding.
11479[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
11481static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011482unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011483/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011485 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011486}
11487
INADA Naoki3ae20562017-01-16 20:41:20 +090011488/*[clinic input]
11489str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
INADA Naoki3ae20562017-01-16 20:41:20 +090011491 tabsize: int = 8
11492
11493Return a copy where all tab characters are expanded using spaces.
11494
11495If tabsize is not given, a tab size of 8 characters is assumed.
11496[clinic start generated code]*/
11497
11498static PyObject *
11499unicode_expandtabs_impl(PyObject *self, int tabsize)
11500/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011502 Py_ssize_t i, j, line_pos, src_len, incr;
11503 Py_UCS4 ch;
11504 PyObject *u;
11505 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011506 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011507 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Antoine Pitrou22425222011-10-04 19:10:51 +020011509 if (PyUnicode_READY(self) == -1)
11510 return NULL;
11511
Thomas Wouters7e474022000-07-16 12:04:32 +000011512 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011513 src_len = PyUnicode_GET_LENGTH(self);
11514 i = j = line_pos = 0;
11515 kind = PyUnicode_KIND(self);
11516 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011517 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011518 for (; i < src_len; i++) {
11519 ch = PyUnicode_READ(kind, src_data, i);
11520 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011521 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011523 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011525 goto overflow;
11526 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011528 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011532 goto overflow;
11533 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 if (ch == '\n' || ch == '\r')
11536 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011538 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011539 if (!found)
11540 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011541
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011543 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 if (!u)
11545 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Antoine Pitroue71d5742011-10-04 15:55:09 +020011548 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
Antoine Pitroue71d5742011-10-04 15:55:09 +020011550 for (; i < src_len; i++) {
11551 ch = PyUnicode_READ(kind, src_data, i);
11552 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011554 incr = tabsize - (line_pos % tabsize);
11555 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011556 FILL(kind, dest_data, ' ', j, incr);
11557 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011559 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 line_pos++;
11562 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011563 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011564 if (ch == '\n' || ch == '\r')
11565 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011567 }
11568 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011569 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011570
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011572 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574}
11575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578\n\
11579Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011580such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581arguments start and end are interpreted as in slice notation.\n\
11582\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011583Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
11585static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011588 /* initialize variables to prevent gcc warning */
11589 PyObject *substring = NULL;
11590 Py_ssize_t start = 0;
11591 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011592 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011594 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011597 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011600 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 if (result == -2)
11603 return NULL;
11604
Christian Heimes217cfd12007-12-02 14:31:20 +000011605 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
11608static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011609unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011611 void *data;
11612 enum PyUnicode_Kind kind;
11613 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011614
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011615 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011616 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011618 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011619 if (PyUnicode_READY(self) == -1) {
11620 return NULL;
11621 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011622 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11623 PyErr_SetString(PyExc_IndexError, "string index out of range");
11624 return NULL;
11625 }
11626 kind = PyUnicode_KIND(self);
11627 data = PyUnicode_DATA(self);
11628 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011629 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630}
11631
Guido van Rossumc2504932007-09-18 19:42:40 +000011632/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011633 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011634static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011635unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636{
Guido van Rossumc2504932007-09-18 19:42:40 +000011637 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011638 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011639
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011640#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011641 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011642#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (_PyUnicode_HASH(self) != -1)
11644 return _PyUnicode_HASH(self);
11645 if (PyUnicode_READY(self) == -1)
11646 return -1;
11647 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011648 /*
11649 We make the hash of the empty string be 0, rather than using
11650 (prefix ^ suffix), since this slightly obfuscates the hash secret
11651 */
11652 if (len == 0) {
11653 _PyUnicode_HASH(self) = 0;
11654 return 0;
11655 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011656 x = _Py_HashBytes(PyUnicode_DATA(self),
11657 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011659 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011662PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011665Return the lowest index in S where substring sub is found, \n\
11666such that sub is contained within S[start:end]. Optional\n\
11667arguments start and end are interpreted as in slice notation.\n\
11668\n\
11669Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670
11671static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011674 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011675 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011676 PyObject *substring = NULL;
11677 Py_ssize_t start = 0;
11678 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011680 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011683 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011686 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 if (result == -2)
11689 return NULL;
11690
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 if (result < 0) {
11692 PyErr_SetString(PyExc_ValueError, "substring not found");
11693 return NULL;
11694 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011695
Christian Heimes217cfd12007-12-02 14:31:20 +000011696 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697}
11698
INADA Naoki3ae20562017-01-16 20:41:20 +090011699/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011700str.isascii as unicode_isascii
11701
11702Return True if all characters in the string are ASCII, False otherwise.
11703
11704ASCII characters have code points in the range U+0000-U+007F.
11705Empty string is ASCII too.
11706[clinic start generated code]*/
11707
11708static PyObject *
11709unicode_isascii_impl(PyObject *self)
11710/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11711{
11712 if (PyUnicode_READY(self) == -1) {
11713 return NULL;
11714 }
11715 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11716}
11717
11718/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011719str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
INADA Naoki3ae20562017-01-16 20:41:20 +090011721Return True if the string is a lowercase string, False otherwise.
11722
11723A string is lowercase if all cased characters in the string are lowercase and
11724there is at least one cased character in the string.
11725[clinic start generated code]*/
11726
11727static PyObject *
11728unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011729/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 Py_ssize_t i, length;
11732 int kind;
11733 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 int cased;
11735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 if (PyUnicode_READY(self) == -1)
11737 return NULL;
11738 length = PyUnicode_GET_LENGTH(self);
11739 kind = PyUnicode_KIND(self);
11740 data = PyUnicode_DATA(self);
11741
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 if (length == 1)
11744 return PyBool_FromLong(
11745 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011747 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011749 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011750
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 for (i = 0; i < length; i++) {
11753 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011754
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011756 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 else if (!cased && Py_UNICODE_ISLOWER(ch))
11758 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011760 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761}
11762
INADA Naoki3ae20562017-01-16 20:41:20 +090011763/*[clinic input]
11764str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
INADA Naoki3ae20562017-01-16 20:41:20 +090011766Return True if the string is an uppercase string, False otherwise.
11767
11768A string is uppercase if all cased characters in the string are uppercase and
11769there is at least one cased character in the string.
11770[clinic start generated code]*/
11771
11772static PyObject *
11773unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011774/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 Py_ssize_t i, length;
11777 int kind;
11778 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 int cased;
11780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783 length = PyUnicode_GET_LENGTH(self);
11784 kind = PyUnicode_KIND(self);
11785 data = PyUnicode_DATA(self);
11786
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 if (length == 1)
11789 return PyBool_FromLong(
11790 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011792 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011794 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011795
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 for (i = 0; i < length; i++) {
11798 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011799
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011801 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 else if (!cased && Py_UNICODE_ISUPPER(ch))
11803 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011805 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806}
11807
INADA Naoki3ae20562017-01-16 20:41:20 +090011808/*[clinic input]
11809str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
INADA Naoki3ae20562017-01-16 20:41:20 +090011811Return True if the string is a title-cased string, False otherwise.
11812
11813In a title-cased string, upper- and title-case characters may only
11814follow uncased characters and lowercase characters only cased ones.
11815[clinic start generated code]*/
11816
11817static PyObject *
11818unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011819/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 Py_ssize_t i, length;
11822 int kind;
11823 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 int cased, previous_is_cased;
11825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (PyUnicode_READY(self) == -1)
11827 return NULL;
11828 length = PyUnicode_GET_LENGTH(self);
11829 kind = PyUnicode_KIND(self);
11830 data = PyUnicode_DATA(self);
11831
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 if (length == 1) {
11834 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11835 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11836 (Py_UNICODE_ISUPPER(ch) != 0));
11837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011839 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011841 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011842
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 cased = 0;
11844 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 for (i = 0; i < length; i++) {
11846 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011847
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11849 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011850 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 previous_is_cased = 1;
11852 cased = 1;
11853 }
11854 else if (Py_UNICODE_ISLOWER(ch)) {
11855 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011856 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 previous_is_cased = 1;
11858 cased = 1;
11859 }
11860 else
11861 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011863 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864}
11865
INADA Naoki3ae20562017-01-16 20:41:20 +090011866/*[clinic input]
11867str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868
INADA Naoki3ae20562017-01-16 20:41:20 +090011869Return True if the string is a whitespace string, False otherwise.
11870
11871A string is whitespace if all characters in the string are whitespace and there
11872is at least one character in the string.
11873[clinic start generated code]*/
11874
11875static PyObject *
11876unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011877/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 Py_ssize_t i, length;
11880 int kind;
11881 void *data;
11882
11883 if (PyUnicode_READY(self) == -1)
11884 return NULL;
11885 length = PyUnicode_GET_LENGTH(self);
11886 kind = PyUnicode_KIND(self);
11887 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 if (length == 1)
11891 return PyBool_FromLong(
11892 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011894 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011896 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 for (i = 0; i < length; i++) {
11899 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011900 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011901 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011903 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904}
11905
INADA Naoki3ae20562017-01-16 20:41:20 +090011906/*[clinic input]
11907str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011908
INADA Naoki3ae20562017-01-16 20:41:20 +090011909Return True if the string is an alphabetic string, False otherwise.
11910
11911A string is alphabetic if all characters in the string are alphabetic and there
11912is at least one character in the string.
11913[clinic start generated code]*/
11914
11915static PyObject *
11916unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011917/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 Py_ssize_t i, length;
11920 int kind;
11921 void *data;
11922
11923 if (PyUnicode_READY(self) == -1)
11924 return NULL;
11925 length = PyUnicode_GET_LENGTH(self);
11926 kind = PyUnicode_KIND(self);
11927 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011928
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011929 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 if (length == 1)
11931 return PyBool_FromLong(
11932 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011933
11934 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011936 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 for (i = 0; i < length; i++) {
11939 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011940 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011942 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011943}
11944
INADA Naoki3ae20562017-01-16 20:41:20 +090011945/*[clinic input]
11946str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011947
INADA Naoki3ae20562017-01-16 20:41:20 +090011948Return True if the string is an alpha-numeric string, False otherwise.
11949
11950A string is alpha-numeric if all characters in the string are alpha-numeric and
11951there is at least one character in the string.
11952[clinic start generated code]*/
11953
11954static PyObject *
11955unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011956/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 int kind;
11959 void *data;
11960 Py_ssize_t len, i;
11961
11962 if (PyUnicode_READY(self) == -1)
11963 return NULL;
11964
11965 kind = PyUnicode_KIND(self);
11966 data = PyUnicode_DATA(self);
11967 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011968
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011969 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (len == 1) {
11971 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11972 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11973 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011974
11975 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011977 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 for (i = 0; i < len; i++) {
11980 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011981 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011982 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011983 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011984 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011985}
11986
INADA Naoki3ae20562017-01-16 20:41:20 +090011987/*[clinic input]
11988str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
INADA Naoki3ae20562017-01-16 20:41:20 +090011990Return True if the string is a decimal string, False otherwise.
11991
11992A string is a decimal string if all characters in the string are decimal and
11993there is at least one character in the string.
11994[clinic start generated code]*/
11995
11996static PyObject *
11997unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011998/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 Py_ssize_t i, length;
12001 int kind;
12002 void *data;
12003
12004 if (PyUnicode_READY(self) == -1)
12005 return NULL;
12006 length = PyUnicode_GET_LENGTH(self);
12007 kind = PyUnicode_KIND(self);
12008 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 if (length == 1)
12012 return PyBool_FromLong(
12013 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012015 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012017 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 for (i = 0; i < length; i++) {
12020 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012021 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012023 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024}
12025
INADA Naoki3ae20562017-01-16 20:41:20 +090012026/*[clinic input]
12027str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
INADA Naoki3ae20562017-01-16 20:41:20 +090012029Return True if the string is a digit string, False otherwise.
12030
12031A string is a digit string if all characters in the string are digits and there
12032is at least one character in the string.
12033[clinic start generated code]*/
12034
12035static PyObject *
12036unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012037/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 Py_ssize_t i, length;
12040 int kind;
12041 void *data;
12042
12043 if (PyUnicode_READY(self) == -1)
12044 return NULL;
12045 length = PyUnicode_GET_LENGTH(self);
12046 kind = PyUnicode_KIND(self);
12047 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 if (length == 1) {
12051 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12052 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012055 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012057 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 for (i = 0; i < length; i++) {
12060 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012061 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012063 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064}
12065
INADA Naoki3ae20562017-01-16 20:41:20 +090012066/*[clinic input]
12067str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
INADA Naoki3ae20562017-01-16 20:41:20 +090012069Return True if the string is a numeric string, False otherwise.
12070
12071A string is numeric if all characters in the string are numeric and there is at
12072least one character in the string.
12073[clinic start generated code]*/
12074
12075static PyObject *
12076unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012077/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 Py_ssize_t i, length;
12080 int kind;
12081 void *data;
12082
12083 if (PyUnicode_READY(self) == -1)
12084 return NULL;
12085 length = PyUnicode_GET_LENGTH(self);
12086 kind = PyUnicode_KIND(self);
12087 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 if (length == 1)
12091 return PyBool_FromLong(
12092 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012094 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012096 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 for (i = 0; i < length; i++) {
12099 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012100 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012102 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103}
12104
Martin v. Löwis47383402007-08-15 07:32:56 +000012105int
12106PyUnicode_IsIdentifier(PyObject *self)
12107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 int kind;
12109 void *data;
12110 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012111 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 if (PyUnicode_READY(self) == -1) {
12114 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 }
12117
12118 /* Special case for empty strings */
12119 if (PyUnicode_GET_LENGTH(self) == 0)
12120 return 0;
12121 kind = PyUnicode_KIND(self);
12122 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012123
12124 /* PEP 3131 says that the first character must be in
12125 XID_Start and subsequent characters in XID_Continue,
12126 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012127 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012128 letters, digits, underscore). However, given the current
12129 definition of XID_Start and XID_Continue, it is sufficient
12130 to check just for these, except that _ must be allowed
12131 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012133 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012134 return 0;
12135
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012136 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012139 return 1;
12140}
12141
INADA Naoki3ae20562017-01-16 20:41:20 +090012142/*[clinic input]
12143str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012144
INADA Naoki3ae20562017-01-16 20:41:20 +090012145Return True if the string is a valid Python identifier, False otherwise.
12146
12147Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12148"class".
12149[clinic start generated code]*/
12150
12151static PyObject *
12152unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012153/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012154{
12155 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12156}
12157
INADA Naoki3ae20562017-01-16 20:41:20 +090012158/*[clinic input]
12159str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012160
INADA Naoki3ae20562017-01-16 20:41:20 +090012161Return True if the string is printable, False otherwise.
12162
12163A string is printable if all of its characters are considered printable in
12164repr() or if it is empty.
12165[clinic start generated code]*/
12166
12167static PyObject *
12168unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012169/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 Py_ssize_t i, length;
12172 int kind;
12173 void *data;
12174
12175 if (PyUnicode_READY(self) == -1)
12176 return NULL;
12177 length = PyUnicode_GET_LENGTH(self);
12178 kind = PyUnicode_KIND(self);
12179 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012180
12181 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 if (length == 1)
12183 return PyBool_FromLong(
12184 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 for (i = 0; i < length; i++) {
12187 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012188 Py_RETURN_FALSE;
12189 }
12190 }
12191 Py_RETURN_TRUE;
12192}
12193
INADA Naoki3ae20562017-01-16 20:41:20 +090012194/*[clinic input]
12195str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196
INADA Naoki3ae20562017-01-16 20:41:20 +090012197 iterable: object
12198 /
12199
12200Concatenate any number of strings.
12201
Martin Panter91a88662017-01-24 00:30:06 +000012202The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012203The result is returned as a new string.
12204
12205Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12206[clinic start generated code]*/
12207
12208static PyObject *
12209unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012210/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211{
INADA Naoki3ae20562017-01-16 20:41:20 +090012212 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213}
12214
Martin v. Löwis18e16552006-02-15 17:27:45 +000012215static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012216unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 if (PyUnicode_READY(self) == -1)
12219 return -1;
12220 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221}
12222
INADA Naoki3ae20562017-01-16 20:41:20 +090012223/*[clinic input]
12224str.ljust as unicode_ljust
12225
12226 width: Py_ssize_t
12227 fillchar: Py_UCS4 = ' '
12228 /
12229
12230Return a left-justified string of length width.
12231
12232Padding is done using the specified fill character (default is a space).
12233[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234
12235static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012236unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12237/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012239 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
Victor Stinnerc4b49542011-12-11 22:44:26 +010012242 if (PyUnicode_GET_LENGTH(self) >= width)
12243 return unicode_result_unchanged(self);
12244
12245 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246}
12247
INADA Naoki3ae20562017-01-16 20:41:20 +090012248/*[clinic input]
12249str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250
INADA Naoki3ae20562017-01-16 20:41:20 +090012251Return a copy of the string converted to lowercase.
12252[clinic start generated code]*/
12253
12254static PyObject *
12255unicode_lower_impl(PyObject *self)
12256/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012258 if (PyUnicode_READY(self) == -1)
12259 return NULL;
12260 if (PyUnicode_IS_ASCII(self))
12261 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012262 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263}
12264
/* Strip modes.  The values double as indexes into stripfuncnames[], so
   they must stay in step with the order of that table. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip modes above.  Both the strings and
   the table itself are const: nothing may repoint an entry at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode i (one of the three
   *STRIP constants; no range check is performed). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012273
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012274/* externally visible for str.strip(unicode) */
12275PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012276_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 void *data;
12279 int kind;
12280 Py_ssize_t i, j, len;
12281 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012282 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12285 return NULL;
12286
12287 kind = PyUnicode_KIND(self);
12288 data = PyUnicode_DATA(self);
12289 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012290 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12292 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012293 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294
Benjamin Peterson14339b62009-01-31 16:36:08 +000012295 i = 0;
12296 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012297 while (i < len) {
12298 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12299 if (!BLOOM(sepmask, ch))
12300 break;
12301 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12302 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 i++;
12304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012305 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012306
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 j = len;
12308 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012309 j--;
12310 while (j >= i) {
12311 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12312 if (!BLOOM(sepmask, ch))
12313 break;
12314 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12315 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012317 }
12318
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012320 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012321
Victor Stinner7931d9a2011-11-04 00:22:48 +010012322 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323}
12324
12325PyObject*
12326PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12327{
12328 unsigned char *data;
12329 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012330 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331
Victor Stinnerde636f32011-10-01 03:55:54 +020012332 if (PyUnicode_READY(self) == -1)
12333 return NULL;
12334
Victor Stinner684d5fd2012-05-03 02:32:34 +020012335 length = PyUnicode_GET_LENGTH(self);
12336 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012337
Victor Stinner684d5fd2012-05-03 02:32:34 +020012338 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012339 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340
Victor Stinnerde636f32011-10-01 03:55:54 +020012341 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012342 PyErr_SetString(PyExc_IndexError, "string index out of range");
12343 return NULL;
12344 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012345 if (start >= length || end < start)
12346 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012347
Victor Stinner684d5fd2012-05-03 02:32:34 +020012348 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012349 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012350 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012351 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012352 }
12353 else {
12354 kind = PyUnicode_KIND(self);
12355 data = PyUnicode_1BYTE_DATA(self);
12356 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012357 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012358 length);
12359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
12362static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012363do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 Py_ssize_t len, i, j;
12366
12367 if (PyUnicode_READY(self) == -1)
12368 return NULL;
12369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012371
Victor Stinnercc7af722013-04-09 22:39:24 +020012372 if (PyUnicode_IS_ASCII(self)) {
12373 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12374
12375 i = 0;
12376 if (striptype != RIGHTSTRIP) {
12377 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012378 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012379 if (!_Py_ascii_whitespace[ch])
12380 break;
12381 i++;
12382 }
12383 }
12384
12385 j = len;
12386 if (striptype != LEFTSTRIP) {
12387 j--;
12388 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012389 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012390 if (!_Py_ascii_whitespace[ch])
12391 break;
12392 j--;
12393 }
12394 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 }
12396 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012397 else {
12398 int kind = PyUnicode_KIND(self);
12399 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012400
Victor Stinnercc7af722013-04-09 22:39:24 +020012401 i = 0;
12402 if (striptype != RIGHTSTRIP) {
12403 while (i < len) {
12404 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12405 if (!Py_UNICODE_ISSPACE(ch))
12406 break;
12407 i++;
12408 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012409 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012410
12411 j = len;
12412 if (striptype != LEFTSTRIP) {
12413 j--;
12414 while (j >= i) {
12415 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12416 if (!Py_UNICODE_ISSPACE(ch))
12417 break;
12418 j--;
12419 }
12420 j++;
12421 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012423
Victor Stinner7931d9a2011-11-04 00:22:48 +010012424 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425}
12426
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012427
12428static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012429do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012430{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012431 if (sep != NULL && sep != Py_None) {
12432 if (PyUnicode_Check(sep))
12433 return _PyUnicode_XStrip(self, striptype, sep);
12434 else {
12435 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 "%s arg must be None or str",
12437 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012438 return NULL;
12439 }
12440 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012441
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012443}
12444
12445
INADA Naoki3ae20562017-01-16 20:41:20 +090012446/*[clinic input]
12447str.strip as unicode_strip
12448
12449 chars: object = None
12450 /
12451
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012453
12454If chars is given and not None, remove characters in chars instead.
12455[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456
12457static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012458unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012459/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460{
INADA Naoki3ae20562017-01-16 20:41:20 +090012461 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462}
12463
12464
INADA Naoki3ae20562017-01-16 20:41:20 +090012465/*[clinic input]
12466str.lstrip as unicode_lstrip
12467
12468 chars: object = NULL
12469 /
12470
12471Return a copy of the string with leading whitespace removed.
12472
12473If chars is given and not None, remove characters in chars instead.
12474[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012475
12476static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012477unicode_lstrip_impl(PyObject *self, PyObject *chars)
12478/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012479{
INADA Naoki3ae20562017-01-16 20:41:20 +090012480 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481}
12482
12483
INADA Naoki3ae20562017-01-16 20:41:20 +090012484/*[clinic input]
12485str.rstrip as unicode_rstrip
12486
12487 chars: object = NULL
12488 /
12489
12490Return a copy of the string with trailing whitespace removed.
12491
12492If chars is given and not None, remove characters in chars instead.
12493[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012494
12495static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012496unicode_rstrip_impl(PyObject *self, PyObject *chars)
12497/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498{
INADA Naoki3ae20562017-01-16 20:41:20 +090012499 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012500}
12501
12502
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012504unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012506 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508
Serhiy Storchaka05997252013-01-26 12:14:02 +020012509 if (len < 1)
12510 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
Victor Stinnerc4b49542011-12-11 22:44:26 +010012512 /* no repeat, return original string */
12513 if (len == 1)
12514 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012515
Benjamin Petersonbac79492012-01-14 13:34:47 -050012516 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 return NULL;
12518
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012519 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012520 PyErr_SetString(PyExc_OverflowError,
12521 "repeated string is too long");
12522 return NULL;
12523 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012525
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012526 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527 if (!u)
12528 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012529 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 if (PyUnicode_GET_LENGTH(str) == 1) {
12532 const int kind = PyUnicode_KIND(str);
12533 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012534 if (kind == PyUnicode_1BYTE_KIND) {
12535 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012536 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012537 }
12538 else if (kind == PyUnicode_2BYTE_KIND) {
12539 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012540 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012541 ucs2[n] = fill_char;
12542 } else {
12543 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12544 assert(kind == PyUnicode_4BYTE_KIND);
12545 for (n = 0; n < len; ++n)
12546 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 }
12549 else {
12550 /* number of characters copied this far */
12551 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012552 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012554 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012556 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012558 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561 }
12562
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012563 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012564 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565}
12566
Alexander Belopolsky40018472011-02-26 01:02:56 +000012567PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012568PyUnicode_Replace(PyObject *str,
12569 PyObject *substr,
12570 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012571 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012573 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12574 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012576 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
INADA Naoki3ae20562017-01-16 20:41:20 +090012579/*[clinic input]
12580str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581
INADA Naoki3ae20562017-01-16 20:41:20 +090012582 old: unicode
12583 new: unicode
12584 count: Py_ssize_t = -1
12585 Maximum number of occurrences to replace.
12586 -1 (the default value) means replace all occurrences.
12587 /
12588
12589Return a copy with all occurrences of substring old replaced by new.
12590
12591If the optional argument count is given, only the first count occurrences are
12592replaced.
12593[clinic start generated code]*/
12594
12595static PyObject *
12596unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12597 Py_ssize_t count)
12598/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012600 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012602 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603}
12604
Alexander Belopolsky40018472011-02-26 01:02:56 +000012605static PyObject *
12606unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012608 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 Py_ssize_t isize;
12610 Py_ssize_t osize, squote, dquote, i, o;
12611 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012612 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012616 return NULL;
12617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 isize = PyUnicode_GET_LENGTH(unicode);
12619 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 /* Compute length of output, quote characters, and
12622 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012623 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 max = 127;
12625 squote = dquote = 0;
12626 ikind = PyUnicode_KIND(unicode);
12627 for (i = 0; i < isize; i++) {
12628 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012629 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012631 case '\'': squote++; break;
12632 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012634 incr = 2;
12635 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 default:
12637 /* Fast-path ASCII */
12638 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012639 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012641 ;
12642 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012645 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012647 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012649 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 if (osize > PY_SSIZE_T_MAX - incr) {
12652 PyErr_SetString(PyExc_OverflowError,
12653 "string is too long to generate repr");
12654 return NULL;
12655 }
12656 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 }
12658
12659 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012660 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012662 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 if (dquote)
12664 /* Both squote and dquote present. Use squote,
12665 and escape them */
12666 osize += squote;
12667 else
12668 quote = '"';
12669 }
Victor Stinner55c08782013-04-14 18:45:39 +020012670 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671
12672 repr = PyUnicode_New(osize, max);
12673 if (repr == NULL)
12674 return NULL;
12675 okind = PyUnicode_KIND(repr);
12676 odata = PyUnicode_DATA(repr);
12677
12678 PyUnicode_WRITE(okind, odata, 0, quote);
12679 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012680 if (unchanged) {
12681 _PyUnicode_FastCopyCharacters(repr, 1,
12682 unicode, 0,
12683 isize);
12684 }
12685 else {
12686 for (i = 0, o = 1; i < isize; i++) {
12687 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688
Victor Stinner55c08782013-04-14 18:45:39 +020012689 /* Escape quotes and backslashes */
12690 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012691 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012693 continue;
12694 }
12695
12696 /* Map special whitespace to '\t', \n', '\r' */
12697 if (ch == '\t') {
12698 PyUnicode_WRITE(okind, odata, o++, '\\');
12699 PyUnicode_WRITE(okind, odata, o++, 't');
12700 }
12701 else if (ch == '\n') {
12702 PyUnicode_WRITE(okind, odata, o++, '\\');
12703 PyUnicode_WRITE(okind, odata, o++, 'n');
12704 }
12705 else if (ch == '\r') {
12706 PyUnicode_WRITE(okind, odata, o++, '\\');
12707 PyUnicode_WRITE(okind, odata, o++, 'r');
12708 }
12709
12710 /* Map non-printable US ASCII to '\xhh' */
12711 else if (ch < ' ' || ch == 0x7F) {
12712 PyUnicode_WRITE(okind, odata, o++, '\\');
12713 PyUnicode_WRITE(okind, odata, o++, 'x');
12714 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12715 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12716 }
12717
12718 /* Copy ASCII characters as-is */
12719 else if (ch < 0x7F) {
12720 PyUnicode_WRITE(okind, odata, o++, ch);
12721 }
12722
12723 /* Non-ASCII characters */
12724 else {
12725 /* Map Unicode whitespace and control characters
12726 (categories Z* and C* except ASCII space)
12727 */
12728 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12729 PyUnicode_WRITE(okind, odata, o++, '\\');
12730 /* Map 8-bit characters to '\xhh' */
12731 if (ch <= 0xff) {
12732 PyUnicode_WRITE(okind, odata, o++, 'x');
12733 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12734 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12735 }
12736 /* Map 16-bit characters to '\uxxxx' */
12737 else if (ch <= 0xffff) {
12738 PyUnicode_WRITE(okind, odata, o++, 'u');
12739 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12740 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12743 }
12744 /* Map 21-bit characters to '\U00xxxxxx' */
12745 else {
12746 PyUnicode_WRITE(okind, odata, o++, 'U');
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12748 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12749 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12755 }
12756 }
12757 /* Copy characters as-is */
12758 else {
12759 PyUnicode_WRITE(okind, odata, o++, ch);
12760 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012761 }
12762 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012765 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012766 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767}
12768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012769PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771\n\
12772Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012773such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774arguments start and end are interpreted as in slice notation.\n\
12775\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012776Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777
12778static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012781 /* initialize variables to prevent gcc warning */
12782 PyObject *substring = NULL;
12783 Py_ssize_t start = 0;
12784 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012785 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012787 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012790 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012793 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 if (result == -2)
12796 return NULL;
12797
Christian Heimes217cfd12007-12-02 14:31:20 +000012798 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799}
12800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012801PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012804Return the highest index in S where substring sub is found,\n\
12805such that sub is contained within S[start:end]. Optional\n\
12806arguments start and end are interpreted as in slice notation.\n\
12807\n\
12808Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
12810static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012813 /* initialize variables to prevent gcc warning */
12814 PyObject *substring = NULL;
12815 Py_ssize_t start = 0;
12816 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012819 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012822 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012825 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 if (result == -2)
12828 return NULL;
12829
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830 if (result < 0) {
12831 PyErr_SetString(PyExc_ValueError, "substring not found");
12832 return NULL;
12833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834
Christian Heimes217cfd12007-12-02 14:31:20 +000012835 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836}
12837
INADA Naoki3ae20562017-01-16 20:41:20 +090012838/*[clinic input]
12839str.rjust as unicode_rjust
12840
12841 width: Py_ssize_t
12842 fillchar: Py_UCS4 = ' '
12843 /
12844
12845Return a right-justified string of length width.
12846
12847Padding is done using the specified fill character (default is a space).
12848[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849
12850static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012851unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12852/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012854 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855 return NULL;
12856
Victor Stinnerc4b49542011-12-11 22:44:26 +010012857 if (PyUnicode_GET_LENGTH(self) >= width)
12858 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859
Victor Stinnerc4b49542011-12-11 22:44:26 +010012860 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861}
12862
Alexander Belopolsky40018472011-02-26 01:02:56 +000012863PyObject *
12864PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012866 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012869 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870}
12871
INADA Naoki3ae20562017-01-16 20:41:20 +090012872/*[clinic input]
12873str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
INADA Naoki3ae20562017-01-16 20:41:20 +090012875 sep: object = None
        The delimiter according to which to split the string.
12877 None (the default value) means split according to any whitespace,
12878 and discard empty strings from the result.
12879 maxsplit: Py_ssize_t = -1
12880 Maximum number of splits to do.
12881 -1 (the default value) means no limit.
12882
12883Return a list of the words in the string, using sep as the delimiter string.
12884[clinic start generated code]*/
12885
12886static PyObject *
12887unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12888/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889{
INADA Naoki3ae20562017-01-16 20:41:20 +090012890 if (sep == Py_None)
12891 return split(self, NULL, maxsplit);
12892 if (PyUnicode_Check(sep))
12893 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012894
12895 PyErr_Format(PyExc_TypeError,
12896 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012897 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899}
12900
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012902PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012903{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012904 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012905 int kind1, kind2;
12906 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012909 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012910 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012911
Victor Stinner14f8f022011-10-05 20:58:25 +020012912 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 len1 = PyUnicode_GET_LENGTH(str_obj);
12915 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012916 if (kind1 < kind2 || len1 < len2) {
12917 _Py_INCREF_UNICODE_EMPTY();
12918 if (!unicode_empty)
12919 out = NULL;
12920 else {
12921 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12922 Py_DECREF(unicode_empty);
12923 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012924 return out;
12925 }
12926 buf1 = PyUnicode_DATA(str_obj);
12927 buf2 = PyUnicode_DATA(sep_obj);
12928 if (kind2 != kind1) {
12929 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12930 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012934 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012936 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12937 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12938 else
12939 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 break;
12941 case PyUnicode_2BYTE_KIND:
12942 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12943 break;
12944 case PyUnicode_4BYTE_KIND:
12945 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12946 break;
12947 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012948 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012950
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012951 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012953
12954 return out;
12955}
12956
12957
12958PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012959PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012960{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012961 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012962 int kind1, kind2;
12963 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012965
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012966 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012969 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 len1 = PyUnicode_GET_LENGTH(str_obj);
12972 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012973 if (kind1 < kind2 || len1 < len2) {
12974 _Py_INCREF_UNICODE_EMPTY();
12975 if (!unicode_empty)
12976 out = NULL;
12977 else {
12978 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12979 Py_DECREF(unicode_empty);
12980 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012981 return out;
12982 }
12983 buf1 = PyUnicode_DATA(str_obj);
12984 buf2 = PyUnicode_DATA(sep_obj);
12985 if (kind2 != kind1) {
12986 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12987 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012988 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012991 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012993 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12994 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12995 else
12996 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 break;
12998 case PyUnicode_2BYTE_KIND:
12999 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13000 break;
13001 case PyUnicode_4BYTE_KIND:
13002 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13003 break;
13004 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013005 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013007
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013010
13011 return out;
13012}
13013
INADA Naoki3ae20562017-01-16 20:41:20 +090013014/*[clinic input]
13015str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013016
INADA Naoki3ae20562017-01-16 20:41:20 +090013017 sep: object
13018 /
13019
13020Partition the string into three parts using the given separator.
13021
13022This will search for the separator in the string. If the separator is found,
13023returns a 3-tuple containing the part before the separator, the separator
13024itself, and the part after it.
13025
13026If the separator is not found, returns a 3-tuple containing the original string
13027and two empty strings.
13028[clinic start generated code]*/
13029
13030static PyObject *
13031unicode_partition(PyObject *self, PyObject *sep)
13032/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033{
INADA Naoki3ae20562017-01-16 20:41:20 +090013034 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035}
13036
INADA Naoki3ae20562017-01-16 20:41:20 +090013037/*[clinic input]
13038str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013039
INADA Naoki3ae20562017-01-16 20:41:20 +090013040Partition the string into three parts using the given separator.
13041
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013042This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013043the separator is found, returns a 3-tuple containing the part before the
13044separator, the separator itself, and the part after it.
13045
13046If the separator is not found, returns a 3-tuple containing two empty strings
13047and the original string.
13048[clinic start generated code]*/
13049
13050static PyObject *
13051unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013052/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053{
INADA Naoki3ae20562017-01-16 20:41:20 +090013054 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055}
13056
Alexander Belopolsky40018472011-02-26 01:02:56 +000013057PyObject *
13058PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013059{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013060 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013061 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013062
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013063 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013064}
13065
INADA Naoki3ae20562017-01-16 20:41:20 +090013066/*[clinic input]
13067str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013068
INADA Naoki3ae20562017-01-16 20:41:20 +090013069Return a list of the words in the string, using sep as the delimiter string.
13070
13071Splits are done starting at the end of the string and working to the front.
13072[clinic start generated code]*/
13073
13074static PyObject *
13075unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13076/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013077{
INADA Naoki3ae20562017-01-16 20:41:20 +090013078 if (sep == Py_None)
13079 return rsplit(self, NULL, maxsplit);
13080 if (PyUnicode_Check(sep))
13081 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013082
13083 PyErr_Format(PyExc_TypeError,
13084 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013085 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013086 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013087}
13088
INADA Naoki3ae20562017-01-16 20:41:20 +090013089/*[clinic input]
13090str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013092 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013093
13094Return a list of the lines in the string, breaking at line boundaries.
13095
13096Line breaks are not included in the resulting list unless keepends is given and
13097true.
13098[clinic start generated code]*/
13099
13100static PyObject *
13101unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013102/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013104 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105}
13106
13107static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013108PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013110 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111}
13112
INADA Naoki3ae20562017-01-16 20:41:20 +090013113/*[clinic input]
13114str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115
INADA Naoki3ae20562017-01-16 20:41:20 +090013116Convert uppercase characters to lowercase and lowercase characters to uppercase.
13117[clinic start generated code]*/
13118
13119static PyObject *
13120unicode_swapcase_impl(PyObject *self)
13121/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013123 if (PyUnicode_READY(self) == -1)
13124 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013125 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126}
13127
Larry Hastings61272b72014-01-07 12:41:53 -080013128/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013129
Larry Hastings31826802013-10-19 00:09:25 -070013130@staticmethod
13131str.maketrans as unicode_maketrans
13132
13133 x: object
13134
13135 y: unicode=NULL
13136
13137 z: unicode=NULL
13138
13139 /
13140
13141Return a translation table usable for str.translate().
13142
13143If there is only one argument, it must be a dictionary mapping Unicode
13144ordinals (integers) or characters to Unicode ordinals, strings or None.
13145Character keys will be then converted to ordinals.
13146If there are two arguments, they must be strings of equal length, and
13147in the resulting dictionary, each character in x will be mapped to the
13148character at the same position in y. If there is a third argument, it
13149must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013150[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013151
Larry Hastings31826802013-10-19 00:09:25 -070013152static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013153unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013154/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013155{
Georg Brandlceee0772007-11-27 23:48:05 +000013156 PyObject *new = NULL, *key, *value;
13157 Py_ssize_t i = 0;
13158 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159
Georg Brandlceee0772007-11-27 23:48:05 +000013160 new = PyDict_New();
13161 if (!new)
13162 return NULL;
13163 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 int x_kind, y_kind, z_kind;
13165 void *x_data, *y_data, *z_data;
13166
Georg Brandlceee0772007-11-27 23:48:05 +000013167 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013168 if (!PyUnicode_Check(x)) {
13169 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13170 "be a string if there is a second argument");
13171 goto err;
13172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013174 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13175 "arguments must have equal length");
13176 goto err;
13177 }
13178 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 x_kind = PyUnicode_KIND(x);
13180 y_kind = PyUnicode_KIND(y);
13181 x_data = PyUnicode_DATA(x);
13182 y_data = PyUnicode_DATA(y);
13183 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13184 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013185 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013186 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013187 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013188 if (!value) {
13189 Py_DECREF(key);
13190 goto err;
13191 }
Georg Brandlceee0772007-11-27 23:48:05 +000013192 res = PyDict_SetItem(new, key, value);
13193 Py_DECREF(key);
13194 Py_DECREF(value);
13195 if (res < 0)
13196 goto err;
13197 }
13198 /* create entries for deleting chars in z */
13199 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 z_kind = PyUnicode_KIND(z);
13201 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013202 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013204 if (!key)
13205 goto err;
13206 res = PyDict_SetItem(new, key, Py_None);
13207 Py_DECREF(key);
13208 if (res < 0)
13209 goto err;
13210 }
13211 }
13212 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 int kind;
13214 void *data;
13215
Georg Brandlceee0772007-11-27 23:48:05 +000013216 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013217 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013218 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13219 "to maketrans it must be a dict");
13220 goto err;
13221 }
13222 /* copy entries into the new dict, converting string keys to int keys */
13223 while (PyDict_Next(x, &i, &key, &value)) {
13224 if (PyUnicode_Check(key)) {
13225 /* convert string keys to integer keys */
13226 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013227 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013228 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13229 "table must be of length 1");
13230 goto err;
13231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 kind = PyUnicode_KIND(key);
13233 data = PyUnicode_DATA(key);
13234 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013235 if (!newkey)
13236 goto err;
13237 res = PyDict_SetItem(new, newkey, value);
13238 Py_DECREF(newkey);
13239 if (res < 0)
13240 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013241 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013242 /* just keep integer keys */
13243 if (PyDict_SetItem(new, key, value) < 0)
13244 goto err;
13245 } else {
13246 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13247 "be strings or integers");
13248 goto err;
13249 }
13250 }
13251 }
13252 return new;
13253 err:
13254 Py_DECREF(new);
13255 return NULL;
13256}
13257
INADA Naoki3ae20562017-01-16 20:41:20 +090013258/*[clinic input]
13259str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260
INADA Naoki3ae20562017-01-16 20:41:20 +090013261 table: object
13262 Translation table, which must be a mapping of Unicode ordinals to
13263 Unicode ordinals, strings, or None.
13264 /
13265
13266Replace each character in the string using the given translation table.
13267
13268The table must implement lookup/indexing via __getitem__, for instance a
13269dictionary or list. If this operation raises LookupError, the character is
13270left untouched. Characters mapped to None are deleted.
13271[clinic start generated code]*/
13272
13273static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013275/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278}
13279
INADA Naoki3ae20562017-01-16 20:41:20 +090013280/*[clinic input]
13281str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282
INADA Naoki3ae20562017-01-16 20:41:20 +090013283Return a copy of the string converted to uppercase.
13284[clinic start generated code]*/
13285
13286static PyObject *
13287unicode_upper_impl(PyObject *self)
13288/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013290 if (PyUnicode_READY(self) == -1)
13291 return NULL;
13292 if (PyUnicode_IS_ASCII(self))
13293 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013294 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295}
13296
INADA Naoki3ae20562017-01-16 20:41:20 +090013297/*[clinic input]
13298str.zfill as unicode_zfill
13299
13300 width: Py_ssize_t
13301 /
13302
13303Pad a numeric string with zeros on the left, to fill a field of the given width.
13304
13305The string is never truncated.
13306[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307
13308static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013309unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013310/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013312 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013313 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 int kind;
13315 void *data;
13316 Py_UCS4 chr;
13317
Benjamin Petersonbac79492012-01-14 13:34:47 -050013318 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013320
Victor Stinnerc4b49542011-12-11 22:44:26 +010013321 if (PyUnicode_GET_LENGTH(self) >= width)
13322 return unicode_result_unchanged(self);
13323
13324 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325
13326 u = pad(self, fill, 0, '0');
13327
Walter Dörwald068325e2002-04-15 13:36:47 +000013328 if (u == NULL)
13329 return NULL;
13330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 kind = PyUnicode_KIND(u);
13332 data = PyUnicode_DATA(u);
13333 chr = PyUnicode_READ(kind, data, fill);
13334
13335 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 PyUnicode_WRITE(kind, data, 0, chr);
13338 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339 }
13340
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013341 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013342 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344
#if 0
/* Compiled out.  Forwards to PyUnicode_TransformDecimalAndSpaceToASCII()
   and returns its result unchanged. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013353PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013356Return True if S starts with the specified prefix, False otherwise.\n\
13357With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013358With optional end, stop comparing S at that position.\n\
13359prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360
13361static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013362unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013363 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013365 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013366 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013367 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013368 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013369 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370
Jesus Ceaac451502011-04-20 17:09:23 +020013371 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013373 if (PyTuple_Check(subobj)) {
13374 Py_ssize_t i;
13375 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013376 substring = PyTuple_GET_ITEM(subobj, i);
13377 if (!PyUnicode_Check(substring)) {
13378 PyErr_Format(PyExc_TypeError,
13379 "tuple for startswith must only contain str, "
13380 "not %.100s",
13381 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013383 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013385 if (result == -1)
13386 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013387 if (result) {
13388 Py_RETURN_TRUE;
13389 }
13390 }
13391 /* nothing matched */
13392 Py_RETURN_FALSE;
13393 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013394 if (!PyUnicode_Check(subobj)) {
13395 PyErr_Format(PyExc_TypeError,
13396 "startswith first arg must be str or "
13397 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013399 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013400 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013401 if (result == -1)
13402 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404}
13405
13406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013407PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013410Return True if S ends with the specified suffix, False otherwise.\n\
13411With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412With optional end, stop comparing S at that position.\n\
13413suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414
13415static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013416unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013419 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013420 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013421 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013422 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013423 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424
Jesus Ceaac451502011-04-20 17:09:23 +020013425 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013427 if (PyTuple_Check(subobj)) {
13428 Py_ssize_t i;
13429 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013430 substring = PyTuple_GET_ITEM(subobj, i);
13431 if (!PyUnicode_Check(substring)) {
13432 PyErr_Format(PyExc_TypeError,
13433 "tuple for endswith must only contain str, "
13434 "not %.100s",
13435 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013437 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013438 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013439 if (result == -1)
13440 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013441 if (result) {
13442 Py_RETURN_TRUE;
13443 }
13444 }
13445 Py_RETURN_FALSE;
13446 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013447 if (!PyUnicode_Check(subobj)) {
13448 PyErr_Format(PyExc_TypeError,
13449 "endswith first arg must be str or "
13450 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013452 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013453 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013454 if (result == -1)
13455 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013456 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457}
13458
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013459static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013460_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013461{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013462 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13463 writer->data = PyUnicode_DATA(writer->buffer);
13464
13465 if (!writer->readonly) {
13466 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013467 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013468 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013469 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013470 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13471 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13472 writer->kind = PyUnicode_WCHAR_KIND;
13473 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13474
Victor Stinner8f674cc2013-04-17 23:02:17 +020013475 /* Copy-on-write mode: set buffer size to 0 so
13476 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13477 * next write. */
13478 writer->size = 0;
13479 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013480}
13481
Victor Stinnerd3f08822012-05-29 12:57:52 +020013482void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013483_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013484{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013486
13487 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013488 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013489
13490 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13491 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13492 writer->kind = PyUnicode_WCHAR_KIND;
13493 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013494}
13495
Victor Stinnerd3f08822012-05-29 12:57:52 +020013496int
13497_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13498 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013499{
13500 Py_ssize_t newlen;
13501 PyObject *newbuffer;
13502
Victor Stinner2740e462016-09-06 16:58:36 -070013503 assert(maxchar <= MAX_UNICODE);
13504
Victor Stinnerca9381e2015-09-22 00:58:32 +020013505 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013506 assert((maxchar > writer->maxchar && length >= 0)
13507 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013508
Victor Stinner202fdca2012-05-07 12:47:02 +020013509 if (length > PY_SSIZE_T_MAX - writer->pos) {
13510 PyErr_NoMemory();
13511 return -1;
13512 }
13513 newlen = writer->pos + length;
13514
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013515 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013518 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013519 if (writer->overallocate
13520 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13521 /* overallocate to limit the number of realloc() */
13522 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013523 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013524 if (newlen < writer->min_length)
13525 newlen = writer->min_length;
13526
Victor Stinnerd3f08822012-05-29 12:57:52 +020013527 writer->buffer = PyUnicode_New(newlen, maxchar);
13528 if (writer->buffer == NULL)
13529 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013531 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013532 if (writer->overallocate
13533 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13534 /* overallocate to limit the number of realloc() */
13535 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013537 if (newlen < writer->min_length)
13538 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013539
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013540 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013541 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013542 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013543 newbuffer = PyUnicode_New(newlen, maxchar);
13544 if (newbuffer == NULL)
13545 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13547 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013548 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013549 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013550 }
13551 else {
13552 newbuffer = resize_compact(writer->buffer, newlen);
13553 if (newbuffer == NULL)
13554 return -1;
13555 }
13556 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013557 }
13558 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013559 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 newbuffer = PyUnicode_New(writer->size, maxchar);
13561 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013562 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13564 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013565 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013566 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013567 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013568 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013569
13570#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013571}
13572
Victor Stinnerca9381e2015-09-22 00:58:32 +020013573int
13574_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13575 enum PyUnicode_Kind kind)
13576{
13577 Py_UCS4 maxchar;
13578
13579 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13580 assert(writer->kind < kind);
13581
13582 switch (kind)
13583 {
13584 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13585 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13586 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13587 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013588 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013589 }
13590
13591 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13592}
13593
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013594static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013595_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013596{
Victor Stinner2740e462016-09-06 16:58:36 -070013597 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013598 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13599 return -1;
13600 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13601 writer->pos++;
13602 return 0;
13603}
13604
13605int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013606_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13607{
13608 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13609}
13610
13611int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013612_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13613{
13614 Py_UCS4 maxchar;
13615 Py_ssize_t len;
13616
13617 if (PyUnicode_READY(str) == -1)
13618 return -1;
13619 len = PyUnicode_GET_LENGTH(str);
13620 if (len == 0)
13621 return 0;
13622 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13623 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013624 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013625 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013626 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013627 Py_INCREF(str);
13628 writer->buffer = str;
13629 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013630 writer->pos += len;
13631 return 0;
13632 }
13633 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13634 return -1;
13635 }
13636 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13637 str, 0, len);
13638 writer->pos += len;
13639 return 0;
13640}
13641
Victor Stinnere215d962012-10-06 23:03:36 +020013642int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013643_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13644 Py_ssize_t start, Py_ssize_t end)
13645{
13646 Py_UCS4 maxchar;
13647 Py_ssize_t len;
13648
13649 if (PyUnicode_READY(str) == -1)
13650 return -1;
13651
13652 assert(0 <= start);
13653 assert(end <= PyUnicode_GET_LENGTH(str));
13654 assert(start <= end);
13655
13656 if (end == 0)
13657 return 0;
13658
13659 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13660 return _PyUnicodeWriter_WriteStr(writer, str);
13661
13662 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13663 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13664 else
13665 maxchar = writer->maxchar;
13666 len = end - start;
13667
13668 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13669 return -1;
13670
13671 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13672 str, start, len);
13673 writer->pos += len;
13674 return 0;
13675}
13676
13677int
Victor Stinner4a587072013-11-19 12:54:53 +010013678_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13679 const char *ascii, Py_ssize_t len)
13680{
13681 if (len == -1)
13682 len = strlen(ascii);
13683
13684 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13685
13686 if (writer->buffer == NULL && !writer->overallocate) {
13687 PyObject *str;
13688
13689 str = _PyUnicode_FromASCII(ascii, len);
13690 if (str == NULL)
13691 return -1;
13692
13693 writer->readonly = 1;
13694 writer->buffer = str;
13695 _PyUnicodeWriter_Update(writer);
13696 writer->pos += len;
13697 return 0;
13698 }
13699
13700 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13701 return -1;
13702
13703 switch (writer->kind)
13704 {
13705 case PyUnicode_1BYTE_KIND:
13706 {
13707 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13708 Py_UCS1 *data = writer->data;
13709
Christian Heimesf051e432016-09-13 20:22:02 +020013710 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013711 break;
13712 }
13713 case PyUnicode_2BYTE_KIND:
13714 {
13715 _PyUnicode_CONVERT_BYTES(
13716 Py_UCS1, Py_UCS2,
13717 ascii, ascii + len,
13718 (Py_UCS2 *)writer->data + writer->pos);
13719 break;
13720 }
13721 case PyUnicode_4BYTE_KIND:
13722 {
13723 _PyUnicode_CONVERT_BYTES(
13724 Py_UCS1, Py_UCS4,
13725 ascii, ascii + len,
13726 (Py_UCS4 *)writer->data + writer->pos);
13727 break;
13728 }
13729 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013730 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013731 }
13732
13733 writer->pos += len;
13734 return 0;
13735}
13736
13737int
13738_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13739 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013740{
13741 Py_UCS4 maxchar;
13742
13743 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13744 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13745 return -1;
13746 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13747 writer->pos += len;
13748 return 0;
13749}
13750
Victor Stinnerd3f08822012-05-29 12:57:52 +020013751PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013752_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013753{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013754 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013755
Victor Stinnerd3f08822012-05-29 12:57:52 +020013756 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013757 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013758 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013760
13761 str = writer->buffer;
13762 writer->buffer = NULL;
13763
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013764 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013765 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13766 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013767 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013768
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013769 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13770 PyObject *str2;
13771 str2 = resize_compact(str, writer->pos);
13772 if (str2 == NULL) {
13773 Py_DECREF(str);
13774 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013775 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013776 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013777 }
13778
Victor Stinner15a0bd32013-07-08 22:29:55 +020013779 assert(_PyUnicode_CheckConsistency(str, 1));
13780 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013781}
13782
Victor Stinnerd3f08822012-05-29 12:57:52 +020013783void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013784_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013785{
13786 Py_CLEAR(writer->buffer);
13787}
13788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013789#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013790
13791PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013793\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013794Return a formatted version of S, using substitutions from args and kwargs.\n\
13795The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013796
Eric Smith27bbca62010-11-04 17:06:58 +000013797PyDoc_STRVAR(format_map__doc__,
13798 "S.format_map(mapping) -> str\n\
13799\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013800Return a formatted version of S, using substitutions from mapping.\n\
13801The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013802
INADA Naoki3ae20562017-01-16 20:41:20 +090013803/*[clinic input]
13804str.__format__ as unicode___format__
13805
13806 format_spec: unicode
13807 /
13808
13809Return a formatted version of the string as described by format_spec.
13810[clinic start generated code]*/
13811
Eric Smith4a7d76d2008-05-30 18:10:19 +000013812static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013813unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013814/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013815{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013816 _PyUnicodeWriter writer;
13817 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013818
Victor Stinnerd3f08822012-05-29 12:57:52 +020013819 if (PyUnicode_READY(self) == -1)
13820 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013821 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013822 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13823 self, format_spec, 0,
13824 PyUnicode_GET_LENGTH(format_spec));
13825 if (ret == -1) {
13826 _PyUnicodeWriter_Dealloc(&writer);
13827 return NULL;
13828 }
13829 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013830}
13831
INADA Naoki3ae20562017-01-16 20:41:20 +090013832/*[clinic input]
13833str.__sizeof__ as unicode_sizeof
13834
13835Return the size of the string in memory, in bytes.
13836[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013837
13838static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013839unicode_sizeof_impl(PyObject *self)
13840/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013842 Py_ssize_t size;
13843
13844 /* If it's a compact object, account for base structure +
13845 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013846 if (PyUnicode_IS_COMPACT_ASCII(self))
13847 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13848 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013850 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013851 else {
13852 /* If it is a two-block object, account for base object, and
13853 for character block if present. */
13854 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013855 if (_PyUnicode_DATA_ANY(self))
13856 size += (PyUnicode_GET_LENGTH(self) + 1) *
13857 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013858 }
13859 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013860 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013861 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13862 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13863 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13864 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013865
13866 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013867}
13868
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013869static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013870unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013871{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013872 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013873 if (!copy)
13874 return NULL;
13875 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013876}
13877
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this
   table (presumably generated by Argument Clinic -- confirm against the
   clinic header); each one is written without a trailing comma here.
   The remaining entries are hand-written PyMethodDef initializers. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Pickling support; registered without a docstring. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
13934
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013935static PyObject *
13936unicode_mod(PyObject *v, PyObject *w)
13937{
Brian Curtindfc80e32011-08-10 20:28:54 -050013938 if (!PyUnicode_Check(v))
13939 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013941}
13942
13943static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013944 0, /*nb_add*/
13945 0, /*nb_subtract*/
13946 0, /*nb_multiply*/
13947 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013948};
13949
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013951 (lenfunc) unicode_length, /* sq_length */
13952 PyUnicode_Concat, /* sq_concat */
13953 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13954 (ssizeargfunc) unicode_getitem, /* sq_item */
13955 0, /* sq_slice */
13956 0, /* sq_ass_item */
13957 0, /* sq_ass_slice */
13958 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013959};
13960
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013961static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013962unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013964 if (PyUnicode_READY(self) == -1)
13965 return NULL;
13966
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013967 if (PyIndex_Check(item)) {
13968 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013969 if (i == -1 && PyErr_Occurred())
13970 return NULL;
13971 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013972 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013973 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013974 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013975 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013976 PyObject *result;
13977 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013978 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013979 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013980
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013981 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013982 return NULL;
13983 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013984 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13985 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013986
13987 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013988 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013989 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013990 slicelength == PyUnicode_GET_LENGTH(self)) {
13991 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013992 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013993 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013994 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013995 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013996 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013997 src_kind = PyUnicode_KIND(self);
13998 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013999 if (!PyUnicode_IS_ASCII(self)) {
14000 kind_limit = kind_maxchar_limit(src_kind);
14001 max_char = 0;
14002 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14003 ch = PyUnicode_READ(src_kind, src_data, cur);
14004 if (ch > max_char) {
14005 max_char = ch;
14006 if (max_char >= kind_limit)
14007 break;
14008 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014009 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014010 }
Victor Stinner55c99112011-10-13 01:17:06 +020014011 else
14012 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014013 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014014 if (result == NULL)
14015 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014016 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014017 dest_data = PyUnicode_DATA(result);
14018
14019 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014020 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14021 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014022 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014023 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014024 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014025 } else {
14026 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14027 return NULL;
14028 }
14029}
14030
14031static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 (lenfunc)unicode_length, /* mp_length */
14033 (binaryfunc)unicode_subscript, /* mp_subscript */
14034 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014035};
14036
Guido van Rossumd57fd912000-03-10 22:53:23 +000014037
Guido van Rossumd57fd912000-03-10 22:53:23 +000014038/* Helpers for PyUnicode_Format() */
14039
/* State shared by the PyUnicode_Format() helpers while one format string
   is being processed. */
struct unicode_formatter_t {
    /* Arguments to substitute.  unicode_format_getnextarg() returns this
       object itself when arglen is negative, and indexes it as a tuple
       otherwise. */
    PyObject *args;
    /* NOTE(review): presumably non-zero when `args` holds an owned
       reference -- confirm against the code that sets it. */
    int args_owned;
    /* arglen: number of arguments (negative for a single non-tuple
       argument, see above); argidx: index of the next argument to hand
       out. */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used for %(name)s lookups --
       not used in this part of the file; confirm. */
    PyObject *dict;

    /* NOTE(review): the fmt* fields appear to describe the format string
       (kind, remaining count, current position, raw data, owning object);
       they are not used in this part of the file -- confirm against
       PyUnicode_Format(). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Writer holding the formatted output. */
    _PyUnicodeWriter writer;
};
14053
/* One parsed conversion specifier of a %-format string. */
struct unicode_format_arg_t {
    /* Conversion character; mainformatlong() switches on it and
       formatfloat() passes it to PyOS_double_to_string(). */
    Py_UCS4 ch;
    /* Bitmask of F_* flags (F_ALT, F_SIGN, F_BLANK are tested in this part
       of the file). */
    int flags;
    /* Field width; mainformatlong() treats -1 as "not given". */
    Py_ssize_t width;
    /* Precision; -1 is treated as "not given" (formatfloat() then uses 6). */
    int prec;
    /* NOTE(review): presumably set when the result is numeric and sign
       handling applies -- not read in this part of the file; confirm. */
    int sign;
};
14061
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014063unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014064{
Victor Stinnera47082312012-10-04 02:19:54 +020014065 Py_ssize_t argidx = ctx->argidx;
14066
14067 if (argidx < ctx->arglen) {
14068 ctx->argidx++;
14069 if (ctx->arglen < 0)
14070 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014071 else
Victor Stinnera47082312012-10-04 02:19:54 +020014072 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073 }
14074 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014075 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014076 return NULL;
14077}
14078
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014079/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014080
Victor Stinnera47082312012-10-04 02:19:54 +020014081/* Format a float into the writer if the writer is not NULL, or into *p_output
14082 otherwise.
14083
14084 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014085static int
Victor Stinnera47082312012-10-04 02:19:54 +020014086formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14087 PyObject **p_output,
14088 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014089{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014090 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014091 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014092 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014093 int prec;
14094 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014095
Guido van Rossumd57fd912000-03-10 22:53:23 +000014096 x = PyFloat_AsDouble(v);
14097 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014098 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014099
Victor Stinnera47082312012-10-04 02:19:54 +020014100 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014101 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014102 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014103
Victor Stinnera47082312012-10-04 02:19:54 +020014104 if (arg->flags & F_ALT)
14105 dtoa_flags = Py_DTSF_ALT;
14106 else
14107 dtoa_flags = 0;
14108 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014109 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014110 return -1;
14111 len = strlen(p);
14112 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014113 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014114 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014115 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014116 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014117 }
14118 else
14119 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014120 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014121 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014122}
14123
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value:  a new str object, or NULL (with an exception set) on error.
 * The output string is of the form
 *         "-"? ("0x" | "0X" | "0o")? digit+
 * The base prefix is present only for o, x and X conversions with alt
 * non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *         val          object to be converted; must be an int (asserted)
 *         alt          non-zero to keep the base prefix
 *         prec         minimum number of digits; 0-fill on left if needed
 *         type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 * values; the sign is kept in front of any prefix and zero fill.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep "numnondigits + prec" (computed as int below) from
       overflowing: numnondigits is at most 3 (sign plus a two-character
       base prefix). */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the textual form in the requested base.  For o/x/X the
       converted text starts with a two-character base prefix, which is
       accounted for in numnondigits. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below is done in int. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set.  The marker is dropped by
       advancing buf two characters; a '-' sign is then rewritten at the
       new start, over the second marker character. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits
       (prec).  The padded text is assembled in a temporary bytes object:
       sign/prefix first, then the zero fill, then the digits. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        /* From here on result is the bytes object, not a str. */
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X).  The range
           'a'..'x' covers the hex digits a-f and the 'x' of the prefix. */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* Build the final str.  A fresh object is needed when result is the
       temporary bytes object, or when buf no longer points at the start
       of result's data (base marker stripped).  The PyUnicode_Check() test
       comes first so PyUnicode_DATA() is never applied to the bytes
       object.  Otherwise the str is reused, resized if its length
       changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14269
Ethan Furmandf3ed242014-01-05 06:50:30 -080014270/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014271 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014272 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014273 * -1 and raise an exception on error */
14274static int
Victor Stinnera47082312012-10-04 02:19:54 +020014275mainformatlong(PyObject *v,
14276 struct unicode_format_arg_t *arg,
14277 PyObject **p_output,
14278 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014279{
14280 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014281 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014282
14283 if (!PyNumber_Check(v))
14284 goto wrongtype;
14285
Ethan Furman9ab74802014-03-21 06:38:46 -070014286 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014288 if (type == 'o' || type == 'x' || type == 'X') {
14289 iobj = PyNumber_Index(v);
14290 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014291 if (PyErr_ExceptionMatches(PyExc_TypeError))
14292 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014293 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014294 }
14295 }
14296 else {
14297 iobj = PyNumber_Long(v);
14298 if (iobj == NULL ) {
14299 if (PyErr_ExceptionMatches(PyExc_TypeError))
14300 goto wrongtype;
14301 return -1;
14302 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014303 }
14304 assert(PyLong_Check(iobj));
14305 }
14306 else {
14307 iobj = v;
14308 Py_INCREF(iobj);
14309 }
14310
14311 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014312 && arg->width == -1 && arg->prec == -1
14313 && !(arg->flags & (F_SIGN | F_BLANK))
14314 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014315 {
14316 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014317 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014318 int base;
14319
Victor Stinnera47082312012-10-04 02:19:54 +020014320 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014321 {
14322 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014323 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014324 case 'd':
14325 case 'i':
14326 case 'u':
14327 base = 10;
14328 break;
14329 case 'o':
14330 base = 8;
14331 break;
14332 case 'x':
14333 case 'X':
14334 base = 16;
14335 break;
14336 }
14337
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014338 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14339 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014340 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014341 }
14342 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014343 return 1;
14344 }
14345
Ethan Furmanb95b5612015-01-23 20:05:18 -080014346 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014347 Py_DECREF(iobj);
14348 if (res == NULL)
14349 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014350 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014351 return 0;
14352
14353wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014354 switch(type)
14355 {
14356 case 'o':
14357 case 'x':
14358 case 'X':
14359 PyErr_Format(PyExc_TypeError,
14360 "%%%c format: an integer is required, "
14361 "not %.200s",
14362 type, Py_TYPE(v)->tp_name);
14363 break;
14364 default:
14365 PyErr_Format(PyExc_TypeError,
14366 "%%%c format: a number is required, "
14367 "not %.200s",
14368 type, Py_TYPE(v)->tp_name);
14369 break;
14370 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014371 return -1;
14372}
14373
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014374static Py_UCS4
14375formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014376{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014377 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014378 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014379 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014380 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014381 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014382 goto onError;
14383 }
14384 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014385 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014386 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014387 /* make sure number is a type of integer */
14388 if (!PyLong_Check(v)) {
14389 iobj = PyNumber_Index(v);
14390 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014391 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014392 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014393 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014394 Py_DECREF(iobj);
14395 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014396 else {
14397 x = PyLong_AsLong(v);
14398 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014399 if (x == -1 && PyErr_Occurred())
14400 goto onError;
14401
Victor Stinner8faf8212011-12-08 22:14:11 +010014402 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014403 PyErr_SetString(PyExc_OverflowError,
14404 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014405 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014406 }
14407
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014408 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014409 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014410
Benjamin Peterson29060642009-01-31 22:14:21 +000014411 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014412 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014413 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014414 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014415}
14416
Victor Stinnera47082312012-10-04 02:19:54 +020014417/* Parse options of an argument: flags, width, precision.
14418 Handle also "%(name)" syntax.
14419
14420 Return 0 if the argument has been formatted into arg->str.
14421 Return 1 if the argument has been written into ctx->writer,
14422 Raise an exception and return -1 on error. */
14423static int
14424unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14425 struct unicode_format_arg_t *arg)
14426{
14427#define FORMAT_READ(ctx) \
14428 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14429
14430 PyObject *v;
14431
Victor Stinnera47082312012-10-04 02:19:54 +020014432 if (arg->ch == '(') {
14433 /* Get argument value from a dictionary. Example: "%(name)s". */
14434 Py_ssize_t keystart;
14435 Py_ssize_t keylen;
14436 PyObject *key;
14437 int pcount = 1;
14438
14439 if (ctx->dict == NULL) {
14440 PyErr_SetString(PyExc_TypeError,
14441 "format requires a mapping");
14442 return -1;
14443 }
14444 ++ctx->fmtpos;
14445 --ctx->fmtcnt;
14446 keystart = ctx->fmtpos;
14447 /* Skip over balanced parentheses */
14448 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14449 arg->ch = FORMAT_READ(ctx);
14450 if (arg->ch == ')')
14451 --pcount;
14452 else if (arg->ch == '(')
14453 ++pcount;
14454 ctx->fmtpos++;
14455 }
14456 keylen = ctx->fmtpos - keystart - 1;
14457 if (ctx->fmtcnt < 0 || pcount > 0) {
14458 PyErr_SetString(PyExc_ValueError,
14459 "incomplete format key");
14460 return -1;
14461 }
14462 key = PyUnicode_Substring(ctx->fmtstr,
14463 keystart, keystart + keylen);
14464 if (key == NULL)
14465 return -1;
14466 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014467 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014468 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014469 }
14470 ctx->args = PyObject_GetItem(ctx->dict, key);
14471 Py_DECREF(key);
14472 if (ctx->args == NULL)
14473 return -1;
14474 ctx->args_owned = 1;
14475 ctx->arglen = -1;
14476 ctx->argidx = -2;
14477 }
14478
14479 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014480 while (--ctx->fmtcnt >= 0) {
14481 arg->ch = FORMAT_READ(ctx);
14482 ctx->fmtpos++;
14483 switch (arg->ch) {
14484 case '-': arg->flags |= F_LJUST; continue;
14485 case '+': arg->flags |= F_SIGN; continue;
14486 case ' ': arg->flags |= F_BLANK; continue;
14487 case '#': arg->flags |= F_ALT; continue;
14488 case '0': arg->flags |= F_ZERO; continue;
14489 }
14490 break;
14491 }
14492
14493 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014494 if (arg->ch == '*') {
14495 v = unicode_format_getnextarg(ctx);
14496 if (v == NULL)
14497 return -1;
14498 if (!PyLong_Check(v)) {
14499 PyErr_SetString(PyExc_TypeError,
14500 "* wants int");
14501 return -1;
14502 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014503 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014504 if (arg->width == -1 && PyErr_Occurred())
14505 return -1;
14506 if (arg->width < 0) {
14507 arg->flags |= F_LJUST;
14508 arg->width = -arg->width;
14509 }
14510 if (--ctx->fmtcnt >= 0) {
14511 arg->ch = FORMAT_READ(ctx);
14512 ctx->fmtpos++;
14513 }
14514 }
14515 else if (arg->ch >= '0' && arg->ch <= '9') {
14516 arg->width = arg->ch - '0';
14517 while (--ctx->fmtcnt >= 0) {
14518 arg->ch = FORMAT_READ(ctx);
14519 ctx->fmtpos++;
14520 if (arg->ch < '0' || arg->ch > '9')
14521 break;
14522 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14523 mixing signed and unsigned comparison. Since arg->ch is between
14524 '0' and '9', casting to int is safe. */
14525 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14526 PyErr_SetString(PyExc_ValueError,
14527 "width too big");
14528 return -1;
14529 }
14530 arg->width = arg->width*10 + (arg->ch - '0');
14531 }
14532 }
14533
14534 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014535 if (arg->ch == '.') {
14536 arg->prec = 0;
14537 if (--ctx->fmtcnt >= 0) {
14538 arg->ch = FORMAT_READ(ctx);
14539 ctx->fmtpos++;
14540 }
14541 if (arg->ch == '*') {
14542 v = unicode_format_getnextarg(ctx);
14543 if (v == NULL)
14544 return -1;
14545 if (!PyLong_Check(v)) {
14546 PyErr_SetString(PyExc_TypeError,
14547 "* wants int");
14548 return -1;
14549 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014550 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014551 if (arg->prec == -1 && PyErr_Occurred())
14552 return -1;
14553 if (arg->prec < 0)
14554 arg->prec = 0;
14555 if (--ctx->fmtcnt >= 0) {
14556 arg->ch = FORMAT_READ(ctx);
14557 ctx->fmtpos++;
14558 }
14559 }
14560 else if (arg->ch >= '0' && arg->ch <= '9') {
14561 arg->prec = arg->ch - '0';
14562 while (--ctx->fmtcnt >= 0) {
14563 arg->ch = FORMAT_READ(ctx);
14564 ctx->fmtpos++;
14565 if (arg->ch < '0' || arg->ch > '9')
14566 break;
14567 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14568 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014569 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014570 return -1;
14571 }
14572 arg->prec = arg->prec*10 + (arg->ch - '0');
14573 }
14574 }
14575 }
14576
14577 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14578 if (ctx->fmtcnt >= 0) {
14579 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14580 if (--ctx->fmtcnt >= 0) {
14581 arg->ch = FORMAT_READ(ctx);
14582 ctx->fmtpos++;
14583 }
14584 }
14585 }
14586 if (ctx->fmtcnt < 0) {
14587 PyErr_SetString(PyExc_ValueError,
14588 "incomplete format");
14589 return -1;
14590 }
14591 return 0;
14592
14593#undef FORMAT_READ
14594}
14595
14596/* Format one argument. Supported conversion specifiers:
14597
14598 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014599 - "i", "d", "u": int or float
14600 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014601 - "e", "E", "f", "F", "g", "G": float
14602 - "c": int or str (1 character)
14603
Victor Stinner8dbd4212012-12-04 09:30:24 +010014604 When possible, the output is written directly into the Unicode writer
14605 (ctx->writer). A string is created when padding is required.
14606
Victor Stinnera47082312012-10-04 02:19:54 +020014607 Return 0 if the argument has been formatted into *p_str,
14608 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014609 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014610static int
14611unicode_format_arg_format(struct unicode_formatter_t *ctx,
14612 struct unicode_format_arg_t *arg,
14613 PyObject **p_str)
14614{
14615 PyObject *v;
14616 _PyUnicodeWriter *writer = &ctx->writer;
14617
14618 if (ctx->fmtcnt == 0)
14619 ctx->writer.overallocate = 0;
14620
Victor Stinnera47082312012-10-04 02:19:54 +020014621 v = unicode_format_getnextarg(ctx);
14622 if (v == NULL)
14623 return -1;
14624
Victor Stinnera47082312012-10-04 02:19:54 +020014625
14626 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014627 case 's':
14628 case 'r':
14629 case 'a':
14630 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14631 /* Fast path */
14632 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14633 return -1;
14634 return 1;
14635 }
14636
14637 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14638 *p_str = v;
14639 Py_INCREF(*p_str);
14640 }
14641 else {
14642 if (arg->ch == 's')
14643 *p_str = PyObject_Str(v);
14644 else if (arg->ch == 'r')
14645 *p_str = PyObject_Repr(v);
14646 else
14647 *p_str = PyObject_ASCII(v);
14648 }
14649 break;
14650
14651 case 'i':
14652 case 'd':
14653 case 'u':
14654 case 'o':
14655 case 'x':
14656 case 'X':
14657 {
14658 int ret = mainformatlong(v, arg, p_str, writer);
14659 if (ret != 0)
14660 return ret;
14661 arg->sign = 1;
14662 break;
14663 }
14664
14665 case 'e':
14666 case 'E':
14667 case 'f':
14668 case 'F':
14669 case 'g':
14670 case 'G':
14671 if (arg->width == -1 && arg->prec == -1
14672 && !(arg->flags & (F_SIGN | F_BLANK)))
14673 {
14674 /* Fast path */
14675 if (formatfloat(v, arg, NULL, writer) == -1)
14676 return -1;
14677 return 1;
14678 }
14679
14680 arg->sign = 1;
14681 if (formatfloat(v, arg, p_str, NULL) == -1)
14682 return -1;
14683 break;
14684
14685 case 'c':
14686 {
14687 Py_UCS4 ch = formatchar(v);
14688 if (ch == (Py_UCS4) -1)
14689 return -1;
14690 if (arg->width == -1 && arg->prec == -1) {
14691 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014692 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014693 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014694 return 1;
14695 }
14696 *p_str = PyUnicode_FromOrdinal(ch);
14697 break;
14698 }
14699
14700 default:
14701 PyErr_Format(PyExc_ValueError,
14702 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014703 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014704 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14705 (int)arg->ch,
14706 ctx->fmtpos - 1);
14707 return -1;
14708 }
14709 if (*p_str == NULL)
14710 return -1;
14711 assert (PyUnicode_Check(*p_str));
14712 return 0;
14713}
14714
14715static int
14716unicode_format_arg_output(struct unicode_formatter_t *ctx,
14717 struct unicode_format_arg_t *arg,
14718 PyObject *str)
14719{
14720 Py_ssize_t len;
14721 enum PyUnicode_Kind kind;
14722 void *pbuf;
14723 Py_ssize_t pindex;
14724 Py_UCS4 signchar;
14725 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014726 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014727 Py_ssize_t sublen;
14728 _PyUnicodeWriter *writer = &ctx->writer;
14729 Py_UCS4 fill;
14730
14731 fill = ' ';
14732 if (arg->sign && arg->flags & F_ZERO)
14733 fill = '0';
14734
14735 if (PyUnicode_READY(str) == -1)
14736 return -1;
14737
14738 len = PyUnicode_GET_LENGTH(str);
14739 if ((arg->width == -1 || arg->width <= len)
14740 && (arg->prec == -1 || arg->prec >= len)
14741 && !(arg->flags & (F_SIGN | F_BLANK)))
14742 {
14743 /* Fast path */
14744 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14745 return -1;
14746 return 0;
14747 }
14748
14749 /* Truncate the string for "s", "r" and "a" formats
14750 if the precision is set */
14751 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14752 if (arg->prec >= 0 && len > arg->prec)
14753 len = arg->prec;
14754 }
14755
14756 /* Adjust sign and width */
14757 kind = PyUnicode_KIND(str);
14758 pbuf = PyUnicode_DATA(str);
14759 pindex = 0;
14760 signchar = '\0';
14761 if (arg->sign) {
14762 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14763 if (ch == '-' || ch == '+') {
14764 signchar = ch;
14765 len--;
14766 pindex++;
14767 }
14768 else if (arg->flags & F_SIGN)
14769 signchar = '+';
14770 else if (arg->flags & F_BLANK)
14771 signchar = ' ';
14772 else
14773 arg->sign = 0;
14774 }
14775 if (arg->width < len)
14776 arg->width = len;
14777
14778 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014779 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014780 if (!(arg->flags & F_LJUST)) {
14781 if (arg->sign) {
14782 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014783 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014784 }
14785 else {
14786 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014787 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014788 }
14789 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014790 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14791 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014792 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014793 }
14794
Victor Stinnera47082312012-10-04 02:19:54 +020014795 buflen = arg->width;
14796 if (arg->sign && len == arg->width)
14797 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014798 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014799 return -1;
14800
14801 /* Write the sign if needed */
14802 if (arg->sign) {
14803 if (fill != ' ') {
14804 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14805 writer->pos += 1;
14806 }
14807 if (arg->width > len)
14808 arg->width--;
14809 }
14810
14811 /* Write the numeric prefix for "x", "X" and "o" formats
14812 if the alternate form is used.
14813 For example, write "0x" for the "%#x" format. */
14814 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14815 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14816 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14817 if (fill != ' ') {
14818 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14819 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14820 writer->pos += 2;
14821 pindex += 2;
14822 }
14823 arg->width -= 2;
14824 if (arg->width < 0)
14825 arg->width = 0;
14826 len -= 2;
14827 }
14828
14829 /* Pad left with the fill character if needed */
14830 if (arg->width > len && !(arg->flags & F_LJUST)) {
14831 sublen = arg->width - len;
14832 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14833 writer->pos += sublen;
14834 arg->width = len;
14835 }
14836
14837 /* If padding with spaces: write sign if needed and/or numeric prefix if
14838 the alternate form is used */
14839 if (fill == ' ') {
14840 if (arg->sign) {
14841 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14842 writer->pos += 1;
14843 }
14844 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14845 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14846 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14847 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14848 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14849 writer->pos += 2;
14850 pindex += 2;
14851 }
14852 }
14853
14854 /* Write characters */
14855 if (len) {
14856 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14857 str, pindex, len);
14858 writer->pos += len;
14859 }
14860
14861 /* Pad right with the fill character if needed */
14862 if (arg->width > len) {
14863 sublen = arg->width - len;
14864 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14865 writer->pos += sublen;
14866 }
14867 return 0;
14868}
14869
14870/* Helper of PyUnicode_Format(): format one arg.
14871 Return 0 on success, raise an exception and return -1 on error. */
14872static int
14873unicode_format_arg(struct unicode_formatter_t *ctx)
14874{
14875 struct unicode_format_arg_t arg;
14876 PyObject *str;
14877 int ret;
14878
Victor Stinner8dbd4212012-12-04 09:30:24 +010014879 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014880 if (arg.ch == '%') {
14881 ctx->fmtpos++;
14882 ctx->fmtcnt--;
14883 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14884 return -1;
14885 return 0;
14886 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014887 arg.flags = 0;
14888 arg.width = -1;
14889 arg.prec = -1;
14890 arg.sign = 0;
14891 str = NULL;
14892
Victor Stinnera47082312012-10-04 02:19:54 +020014893 ret = unicode_format_arg_parse(ctx, &arg);
14894 if (ret == -1)
14895 return -1;
14896
14897 ret = unicode_format_arg_format(ctx, &arg, &str);
14898 if (ret == -1)
14899 return -1;
14900
14901 if (ret != 1) {
14902 ret = unicode_format_arg_output(ctx, &arg, str);
14903 Py_DECREF(str);
14904 if (ret == -1)
14905 return -1;
14906 }
14907
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014908 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014909 PyErr_SetString(PyExc_TypeError,
14910 "not all arguments converted during string formatting");
14911 return -1;
14912 }
14913 return 0;
14914}
14915
Alexander Belopolsky40018472011-02-26 01:02:56 +000014916PyObject *
14917PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014918{
Victor Stinnera47082312012-10-04 02:19:54 +020014919 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014920
Guido van Rossumd57fd912000-03-10 22:53:23 +000014921 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014922 PyErr_BadInternalCall();
14923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014924 }
Victor Stinnera47082312012-10-04 02:19:54 +020014925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014926 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014927 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014928
14929 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014930 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14931 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14932 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14933 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014934
Victor Stinner8f674cc2013-04-17 23:02:17 +020014935 _PyUnicodeWriter_Init(&ctx.writer);
14936 ctx.writer.min_length = ctx.fmtcnt + 100;
14937 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014938
Guido van Rossumd57fd912000-03-10 22:53:23 +000014939 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014940 ctx.arglen = PyTuple_Size(args);
14941 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014942 }
14943 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014944 ctx.arglen = -1;
14945 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946 }
Victor Stinnera47082312012-10-04 02:19:54 +020014947 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014948 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014949 ctx.dict = args;
14950 else
14951 ctx.dict = NULL;
14952 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953
Victor Stinnera47082312012-10-04 02:19:54 +020014954 while (--ctx.fmtcnt >= 0) {
14955 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014956 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014957
14958 nonfmtpos = ctx.fmtpos++;
14959 while (ctx.fmtcnt >= 0 &&
14960 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14961 ctx.fmtpos++;
14962 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014963 }
Victor Stinnera47082312012-10-04 02:19:54 +020014964 if (ctx.fmtcnt < 0) {
14965 ctx.fmtpos--;
14966 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014967 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014968
Victor Stinnercfc4c132013-04-03 01:48:39 +020014969 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14970 nonfmtpos, ctx.fmtpos) < 0)
14971 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 }
14973 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014974 ctx.fmtpos++;
14975 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014976 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014977 }
14978 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014979
Victor Stinnera47082312012-10-04 02:19:54 +020014980 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014981 PyErr_SetString(PyExc_TypeError,
14982 "not all arguments converted during string formatting");
14983 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014984 }
14985
Victor Stinnera47082312012-10-04 02:19:54 +020014986 if (ctx.args_owned) {
14987 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988 }
Victor Stinnera47082312012-10-04 02:19:54 +020014989 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990
Benjamin Peterson29060642009-01-31 22:14:21 +000014991 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014992 _PyUnicodeWriter_Dealloc(&ctx.writer);
14993 if (ctx.args_owned) {
14994 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014995 }
14996 return NULL;
14997}
14998
Jeremy Hylton938ace62002-07-17 16:30:39 +000014999static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015000unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15001
Tim Peters6d6c1a32001-08-02 04:15:00 +000015002static PyObject *
15003unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15004{
Benjamin Peterson29060642009-01-31 22:14:21 +000015005 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015006 static char *kwlist[] = {"object", "encoding", "errors", 0};
15007 char *encoding = NULL;
15008 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015009
Benjamin Peterson14339b62009-01-31 16:36:08 +000015010 if (type != &PyUnicode_Type)
15011 return unicode_subtype_new(type, args, kwds);
15012 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015013 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 return NULL;
15015 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015016 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015017 if (encoding == NULL && errors == NULL)
15018 return PyObject_Str(x);
15019 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015020 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015021}
15022
Guido van Rossume023fe02001-08-30 03:12:59 +000015023static PyObject *
15024unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15025{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015026 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015027 Py_ssize_t length, char_size;
15028 int share_wstr, share_utf8;
15029 unsigned int kind;
15030 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015031
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015033
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015034 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015035 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015036 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015037 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015038 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015039 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015040 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015041 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015042
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015043 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015044 if (self == NULL) {
15045 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 return NULL;
15047 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015048 kind = PyUnicode_KIND(unicode);
15049 length = PyUnicode_GET_LENGTH(unicode);
15050
15051 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015052#ifdef Py_DEBUG
15053 _PyUnicode_HASH(self) = -1;
15054#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015056#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015057 _PyUnicode_STATE(self).interned = 0;
15058 _PyUnicode_STATE(self).kind = kind;
15059 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015060 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061 _PyUnicode_STATE(self).ready = 1;
15062 _PyUnicode_WSTR(self) = NULL;
15063 _PyUnicode_UTF8_LENGTH(self) = 0;
15064 _PyUnicode_UTF8(self) = NULL;
15065 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015066 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015067
15068 share_utf8 = 0;
15069 share_wstr = 0;
15070 if (kind == PyUnicode_1BYTE_KIND) {
15071 char_size = 1;
15072 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15073 share_utf8 = 1;
15074 }
15075 else if (kind == PyUnicode_2BYTE_KIND) {
15076 char_size = 2;
15077 if (sizeof(wchar_t) == 2)
15078 share_wstr = 1;
15079 }
15080 else {
15081 assert(kind == PyUnicode_4BYTE_KIND);
15082 char_size = 4;
15083 if (sizeof(wchar_t) == 4)
15084 share_wstr = 1;
15085 }
15086
15087 /* Ensure we won't overflow the length. */
15088 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15089 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015090 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015092 data = PyObject_MALLOC((length + 1) * char_size);
15093 if (data == NULL) {
15094 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015095 goto onError;
15096 }
15097
Victor Stinnerc3c74152011-10-02 20:39:55 +020015098 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015099 if (share_utf8) {
15100 _PyUnicode_UTF8_LENGTH(self) = length;
15101 _PyUnicode_UTF8(self) = data;
15102 }
15103 if (share_wstr) {
15104 _PyUnicode_WSTR_LENGTH(self) = length;
15105 _PyUnicode_WSTR(self) = (wchar_t *)data;
15106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015107
Christian Heimesf051e432016-09-13 20:22:02 +020015108 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015109 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015110 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015111#ifdef Py_DEBUG
15112 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15113#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015114 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015115 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015116
15117onError:
15118 Py_DECREF(unicode);
15119 Py_DECREF(self);
15120 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015121}
15122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015123PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015124"str(object='') -> str\n\
15125str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015126\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015127Create a new string object from the given object. If encoding or\n\
15128errors is specified, then the object must expose a data buffer\n\
15129that will be decoded using the given encoding and error handler.\n\
15130Otherwise, returns the result of object.__str__() (if defined)\n\
15131or repr(object).\n\
15132encoding defaults to sys.getdefaultencoding().\n\
15133errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015134
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015135static PyObject *unicode_iter(PyObject *seq);
15136
Guido van Rossumd57fd912000-03-10 22:53:23 +000015137PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015138 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015139 "str", /* tp_name */
15140 sizeof(PyUnicodeObject), /* tp_size */
15141 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015142 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 (destructor)unicode_dealloc, /* tp_dealloc */
15144 0, /* tp_print */
15145 0, /* tp_getattr */
15146 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015147 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015148 unicode_repr, /* tp_repr */
15149 &unicode_as_number, /* tp_as_number */
15150 &unicode_as_sequence, /* tp_as_sequence */
15151 &unicode_as_mapping, /* tp_as_mapping */
15152 (hashfunc) unicode_hash, /* tp_hash*/
15153 0, /* tp_call*/
15154 (reprfunc) unicode_str, /* tp_str */
15155 PyObject_GenericGetAttr, /* tp_getattro */
15156 0, /* tp_setattro */
15157 0, /* tp_as_buffer */
15158 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015159 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 unicode_doc, /* tp_doc */
15161 0, /* tp_traverse */
15162 0, /* tp_clear */
15163 PyUnicode_RichCompare, /* tp_richcompare */
15164 0, /* tp_weaklistoffset */
15165 unicode_iter, /* tp_iter */
15166 0, /* tp_iternext */
15167 unicode_methods, /* tp_methods */
15168 0, /* tp_members */
15169 0, /* tp_getset */
15170 &PyBaseObject_Type, /* tp_base */
15171 0, /* tp_dict */
15172 0, /* tp_descr_get */
15173 0, /* tp_descr_set */
15174 0, /* tp_dictoffset */
15175 0, /* tp_init */
15176 0, /* tp_alloc */
15177 unicode_new, /* tp_new */
15178 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015179};
15180
15181/* Initialize the Unicode implementation */
15182
Victor Stinner3a50e702011-10-18 21:21:00 +020015183int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015184{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015185 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015186 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015187 0x000A, /* LINE FEED */
15188 0x000D, /* CARRIAGE RETURN */
15189 0x001C, /* FILE SEPARATOR */
15190 0x001D, /* GROUP SEPARATOR */
15191 0x001E, /* RECORD SEPARATOR */
15192 0x0085, /* NEXT LINE */
15193 0x2028, /* LINE SEPARATOR */
15194 0x2029, /* PARAGRAPH SEPARATOR */
15195 };
15196
Fred Drakee4315f52000-05-09 19:53:39 +000015197 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015198 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015199 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015200 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015201 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015202
Guido van Rossumcacfc072002-05-24 19:01:59 +000015203 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015204 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015205
15206 /* initialize the linebreak bloom filter */
15207 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015208 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015209 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015210
Christian Heimes26532f72013-07-20 14:57:16 +020015211 if (PyType_Ready(&EncodingMapType) < 0)
15212 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015213
Benjamin Petersonc4311282012-10-30 23:21:10 -040015214 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15215 Py_FatalError("Can't initialize field name iterator type");
15216
15217 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15218 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015219
Victor Stinner3a50e702011-10-18 21:21:00 +020015220 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015221}
15222
15223/* Finalize the Unicode implementation */
15224
/* Releases nothing in this implementation; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15230
Guido van Rossumd57fd912000-03-10 22:53:23 +000015231void
Thomas Wouters78890102000-07-22 19:25:51 +000015232_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015233{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015234 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015235
Serhiy Storchaka05997252013-01-26 12:14:02 +020015236 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015237
Serhiy Storchaka05997252013-01-26 12:14:02 +020015238 for (i = 0; i < 256; i++)
15239 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015240 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015241 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015242}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015243
Walter Dörwald16807132007-05-25 13:52:07 +000015244void
15245PyUnicode_InternInPlace(PyObject **p)
15246{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015247 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015249#ifdef Py_DEBUG
15250 assert(s != NULL);
15251 assert(_PyUnicode_CHECK(s));
15252#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015254 return;
15255#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 /* If it's a subclass, we don't really know what putting
15257 it in the interned dict might do. */
15258 if (!PyUnicode_CheckExact(s))
15259 return;
15260 if (PyUnicode_CHECK_INTERNED(s))
15261 return;
15262 if (interned == NULL) {
15263 interned = PyDict_New();
15264 if (interned == NULL) {
15265 PyErr_Clear(); /* Don't leave an exception */
15266 return;
15267 }
15268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015270 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015272 if (t == NULL) {
15273 PyErr_Clear();
15274 return;
15275 }
15276 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015277 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015278 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015279 return;
15280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 /* The two references in interned are not counted by refcnt.
15282 The deallocator will take care of this */
15283 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015284 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015285}
15286
15287void
15288PyUnicode_InternImmortal(PyObject **p)
15289{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 PyUnicode_InternInPlace(p);
15291 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015292 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 Py_INCREF(*p);
15294 }
Walter Dörwald16807132007-05-25 13:52:07 +000015295}
15296
15297PyObject *
15298PyUnicode_InternFromString(const char *cp)
15299{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015300 PyObject *s = PyUnicode_FromString(cp);
15301 if (s == NULL)
15302 return NULL;
15303 PyUnicode_InternInPlace(&s);
15304 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015305}
15306
Alexander Belopolsky40018472011-02-26 01:02:56 +000015307void
15308_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015309{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015311 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 Py_ssize_t i, n;
15313 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015314
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 if (interned == NULL || !PyDict_Check(interned))
15316 return;
15317 keys = PyDict_Keys(interned);
15318 if (keys == NULL || !PyList_Check(keys)) {
15319 PyErr_Clear();
15320 return;
15321 }
Walter Dörwald16807132007-05-25 13:52:07 +000015322
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15324 detector, interned unicode strings are not forcibly deallocated;
15325 rather, we give them their stolen references back, and then clear
15326 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015327
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 n = PyList_GET_SIZE(keys);
15329 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015330 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015332 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015333 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015334 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015335 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015336 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 case SSTATE_NOT_INTERNED:
15338 /* XXX Shouldn't happen */
15339 break;
15340 case SSTATE_INTERNED_IMMORTAL:
15341 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015342 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 break;
15344 case SSTATE_INTERNED_MORTAL:
15345 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015346 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 break;
15348 default:
15349 Py_FatalError("Inconsistent interned string state.");
15350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015351 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 }
15353 fprintf(stderr, "total size of all interned strings: "
15354 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15355 "mortal/immortal\n", mortal_size, immortal_size);
15356 Py_DECREF(keys);
15357 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015358 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015359}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015360
15361
15362/********************* Unicode Iterator **************************/
15363
/* State of a str iterator: the string being walked and the position of
   the next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15369
15370static void
15371unicodeiter_dealloc(unicodeiterobject *it)
15372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 _PyObject_GC_UNTRACK(it);
15374 Py_XDECREF(it->it_seq);
15375 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015376}
15377
15378static int
15379unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15380{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015381 Py_VISIT(it->it_seq);
15382 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015383}
15384
15385static PyObject *
15386unicodeiter_next(unicodeiterobject *it)
15387{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015388 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015389
Benjamin Peterson14339b62009-01-31 16:36:08 +000015390 assert(it != NULL);
15391 seq = it->it_seq;
15392 if (seq == NULL)
15393 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015394 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015396 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15397 int kind = PyUnicode_KIND(seq);
15398 void *data = PyUnicode_DATA(seq);
15399 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15400 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015401 if (item != NULL)
15402 ++it->it_index;
15403 return item;
15404 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015405
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015407 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015409}
15410
15411static PyObject *
15412unicodeiter_len(unicodeiterobject *it)
15413{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015414 Py_ssize_t len = 0;
15415 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015416 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418}
15419
15420PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15421
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015422static PyObject *
15423unicodeiter_reduce(unicodeiterobject *it)
15424{
15425 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015426 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015427 it->it_seq, it->it_index);
15428 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015429 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015430 if (u == NULL)
15431 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015432 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015433 }
15434}
15435
15436PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15437
15438static PyObject *
15439unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15440{
15441 Py_ssize_t index = PyLong_AsSsize_t(state);
15442 if (index == -1 && PyErr_Occurred())
15443 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015444 if (it->it_seq != NULL) {
15445 if (index < 0)
15446 index = 0;
15447 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15448 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15449 it->it_index = index;
15450 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451 Py_RETURN_NONE;
15452}
15453
15454PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15455
/* Methods of the str iterator type (installed as tp_methods of
   PyUnicodeIter_Type). */
static PyMethodDef unicodeiter_methods[] = {
    /* Remaining length estimate */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    /* Pickling support */
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15465
15466PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015467 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15468 "str_iterator", /* tp_name */
15469 sizeof(unicodeiterobject), /* tp_basicsize */
15470 0, /* tp_itemsize */
15471 /* methods */
15472 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15473 0, /* tp_print */
15474 0, /* tp_getattr */
15475 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015476 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015477 0, /* tp_repr */
15478 0, /* tp_as_number */
15479 0, /* tp_as_sequence */
15480 0, /* tp_as_mapping */
15481 0, /* tp_hash */
15482 0, /* tp_call */
15483 0, /* tp_str */
15484 PyObject_GenericGetAttr, /* tp_getattro */
15485 0, /* tp_setattro */
15486 0, /* tp_as_buffer */
15487 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15488 0, /* tp_doc */
15489 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15490 0, /* tp_clear */
15491 0, /* tp_richcompare */
15492 0, /* tp_weaklistoffset */
15493 PyObject_SelfIter, /* tp_iter */
15494 (iternextfunc)unicodeiter_next, /* tp_iternext */
15495 unicodeiter_methods, /* tp_methods */
15496 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015497};
15498
15499static PyObject *
15500unicode_iter(PyObject *seq)
15501{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015502 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015503
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 if (!PyUnicode_Check(seq)) {
15505 PyErr_BadInternalCall();
15506 return NULL;
15507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015508 if (PyUnicode_READY(seq) == -1)
15509 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015510 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15511 if (it == NULL)
15512 return NULL;
15513 it->it_index = 0;
15514 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015515 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015516 _PyObject_GC_TRACK(it);
15517 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015518}
15519
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015520
15521size_t
15522Py_UNICODE_strlen(const Py_UNICODE *u)
15523{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015524 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015525}
15526
15527Py_UNICODE*
15528Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15529{
15530 Py_UNICODE *u = s1;
15531 while ((*u++ = *s2++));
15532 return s1;
15533}
15534
15535Py_UNICODE*
15536Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15537{
15538 Py_UNICODE *u = s1;
15539 while ((*u++ = *s2++))
15540 if (n-- == 0)
15541 break;
15542 return s1;
15543}
15544
15545Py_UNICODE*
15546Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15547{
15548 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015549 u1 += wcslen(u1);
15550 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015551 return s1;
15552}
15553
15554int
15555Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15556{
15557 while (*s1 && *s2 && *s1 == *s2)
15558 s1++, s2++;
15559 if (*s1 && *s2)
15560 return (*s1 < *s2) ? -1 : +1;
15561 if (*s1)
15562 return 1;
15563 if (*s2)
15564 return -1;
15565 return 0;
15566}
15567
15568int
15569Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15570{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015571 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015572 for (; n != 0; n--) {
15573 u1 = *s1;
15574 u2 = *s2;
15575 if (u1 != u2)
15576 return (u1 < u2) ? -1 : +1;
15577 if (u1 == '\0')
15578 return 0;
15579 s1++;
15580 s2++;
15581 }
15582 return 0;
15583}
15584
15585Py_UNICODE*
15586Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15587{
15588 const Py_UNICODE *p;
15589 for (p = s; *p; p++)
15590 if (*p == c)
15591 return (Py_UNICODE*)p;
15592 return NULL;
15593}
15594
15595Py_UNICODE*
15596Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15597{
15598 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015599 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015600 while (p != s) {
15601 p--;
15602 if (*p == c)
15603 return (Py_UNICODE*)p;
15604 }
15605 return NULL;
15606}
Victor Stinner331ea922010-08-10 16:37:20 +000015607
Victor Stinner71133ff2010-09-01 23:43:53 +000015608Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015609PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015610{
Victor Stinner577db2c2011-10-11 22:12:48 +020015611 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015612 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015614 if (!PyUnicode_Check(unicode)) {
15615 PyErr_BadArgument();
15616 return NULL;
15617 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015618 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015619 if (u == NULL)
15620 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015621 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015622 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015623 PyErr_NoMemory();
15624 return NULL;
15625 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015626 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015627 size *= sizeof(Py_UNICODE);
15628 copy = PyMem_Malloc(size);
15629 if (copy == NULL) {
15630 PyErr_NoMemory();
15631 return NULL;
15632 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015633 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015634 return copy;
15635}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015636
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions of the _string module; both take a single argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15647
15648static struct PyModuleDef _string_module = {
15649 PyModuleDef_HEAD_INIT,
15650 "_string",
15651 PyDoc_STR("string helper module"),
15652 0,
15653 _string_methods,
15654 NULL,
15655 NULL,
15656 NULL,
15657 NULL
15658};
15659
15660PyMODINIT_FUNC
15661PyInit__string(void)
15662{
15663 return PyModule_Create(&_string_module);
15664}
15665
15666
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015667#ifdef __cplusplus
15668}
15669#endif